#include "jnipointer.h"
#include "German.h"
#include "ltqnorm.h"
#include "GermanGPU.h"

#include <stdio.h>
#include <list>
#include <unistd.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/time.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <curand.h>
#include <curand_kernel.h>
#include <ctime>
#include <string>
#include <iostream>

using namespace std;
// NOTE(review): this macro expands to "1e-9;" -- the trailing ';' is part of the
// expansion, so it is only usable as the last token of a statement. Check every
// use before removing the semicolon.
#define epsilon 1e-9;//small tolerance used to adjust for floating-point precision
#define RegPerThread 63//registers per thread; per the original note this value is read off the compile command output

/**
 * Host-side copy of the network (the original note calls it "PBN") and of the
 * run settings.
 * NOTE(review): the kernels in this file take device arrays with matching
 * names (gpu_n, gpu_nf, gpu_nv, gpu_F, gpu_varF, gpu_cij, gpu_p,
 * gpu_positiveIndex, gpu_negativeIndex, gpu_stateSize); these globals are
 * presumably their host-side sources -- confirm against the code that fills
 * them, which is not visible in this part of the file.
 */
 int n;
 int *nf;
 int *nv;
 int *F;
 int *varF;
float *cij;
float p;
int *g_positiveIndex;
int *g_negativeIndex;
int stateSize;
// Output file name and statistical settings; their use is outside this part of
// the file.
 string outputName;
 float precision;
 float confidence;
 float epsilon_twostate;
// NOTE(review): presumably the launch configuration (blocks / threads) -- confirm.
 int blockInfor[2];

/*
 * Aborts the program when a CUDA runtime call failed.
 *
 * err  - status returned by the CUDA call
 * file - source file of the call site (supplied by HANDLE_ERROR)
 * line - source line of the call site (supplied by HANDLE_ERROR)
 *
 * On success the function simply returns. On failure it prints the CUDA error
 * text together with the call site and exits with EXIT_FAILURE.
 */
static void HandleError(cudaError_t err, const char *file, int line) {
	if (err == cudaSuccess) {
		return;
	}
	const char *message = cudaGetErrorString(err);
	printf("%s in %s at line %d\n", message, file, line);
	exit(EXIT_FAILURE);
}
/* Wraps a CUDA runtime call so that a failure is reported with file and line. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Constant-memory lookup table. The kernels add powNum[1][ind] to the shift
// amount (shifNum) for every input variable whose bit is set; per the original
// note, row 0 is all zeros. The table is filled outside this part of the file.
__constant__ int powNum[2][32]; //used for adding shifNums, powNum[0][*]=0

// Dynamically sized shared-memory buffer. Each kernel carves it into its
// per-block copies of the network tables; its size is given at launch time.
extern __shared__ int arrays[];

/*
 * GPU kernel that initialises the random-number generator state of each thread.
 *
 * seed   - seed shared by all threads (the original note says the host passes
 *          the current time)
 * states - array with one curandState_t slot per launched thread; thread idx
 *          writes slot idx. There is no bounds check, so the array must cover
 *          every launched thread.
 */
__global__ void init(int seed, curandState_t* states) {
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Every thread uses the same seed but its own global thread id as the
	// sequence number, so each thread gets a different sequence of numbers.
	// The offset into the sequence is 0.
	curandState_t* slot = states + tid;
	curand_init(seed, tid, 0, slot);
}

/*
 * Tests whether a multi-word state belongs to meta state 0.
 *
 * The state is read from the stateSize consecutive ints that start at
 * currentState[offset]. It matches when, for every word i, all bits set in
 * positiveIndex[i] are also set in the state word and none of the bits set in
 * negativeIndex[i] are set in the state word.
 *
 * stateSize     - number of 32-bit words that make up one state
 * positiveIndex - per-word mask of bits that must be 1
 * negativeIndex - per-word mask of bits that must be 0
 * currentState  - array holding the state words
 * offset        - index of the first word of the state inside currentState
 *
 * Returns true when the state matches, false otherwise.
 */
__device__ bool checkMetaStateInt(int stateSize, int* positiveIndex,
		int* negativeIndex, int* currentState, int offset) {
	const int* words = currentState + offset;

	// First pass: every required bit has to be present.
	for (int w = 0; w < stateSize; ++w) {
		const int required = positiveIndex[w];
		if ((required & words[w]) != required) {
			return false;
		}
	}

	// Second pass: no forbidden bit may be present.
	for (int w = 0; w < stateSize; ++w) {
		if (negativeIndex[w] & words[w]) {
			return false;
		}
	}
	return true;
}

/*
 * Tests one 32-bit word of a state against the meta-state masks.
 *
 * positiveIndex - per-word mask of bits that must be 1
 * negativeIndex - per-word mask of bits that must be 0
 * currentState  - the state word to test
 * offset        - index of the mask word that belongs to currentState
 *
 * Returns true when every bit of positiveIndex[offset] is set in currentState
 * and no bit of negativeIndex[offset] is set in it, false otherwise.
 */
__device__ bool checkMetaStateKernel(int* positiveIndex, int* negativeIndex,
		int currentState, int offset) {
	const int mustBeSet = positiveIndex[offset];
	if ((mustBeSet & currentState) != mustBeSet) {
		return false;
	}
	return (negativeIndex[offset] & currentState) == 0;
}

/**
 * Simulation kernel for networks whose state fits in one 32-bit word
 * (node i is stored in bit i % 32 of the per-thread state word).
 *
 * Every thread simulates its own trajectory for *gpu_steps steps. In each
 * step one uniform random number is drawn per node; a node whose draw is
 * below *gpu_p has its bit flipped (perturbation). Only if no node was
 * perturbed are all nodes updated from the previous state: one more random
 * number per node selects one of the node's functions by comparison with the
 * cij entries, and the selected F entry is shifted by an amount built from the
 * current values of the function's input nodes (listed in varF); its lowest
 * bit becomes the node's new value. After each step the new state is
 * classified with checkMetaStateKernel and the counters are updated.
 *
 * Launch requirements:
 *  - one thread per trajectory; the global thread index is not bounds-checked,
 *    so every per-thread array must cover all launched threads;
 *  - dynamic shared memory large enough for the layout carved out below.
 *
 * Parameters (all device pointers):
 *  states            in/out  per-thread cuRAND state
 *  gpu_n             in      number of nodes
 *  gpu_nf            in      n ints, copied to shared memory (not read again here)
 *  gpu_nv            in      gpu_cumNf[n] ints, copied to shared memory (not read again here)
 *  gpu_cumNf         in      n + 1 offsets; the entries of node i start at gpu_cumNf[i]
 *  gpu_cumNv         in      gpu_cumNf[n] + 1 offsets into gpu_varF, one range per function
 *  gpu_F             in      one int per function, used as a bit table
 *  gpu_varF          in      input node indices of all functions
 *  gpu_cij           in      per-function thresholds compared with the random draw
 *                            (presumably cumulative selection probabilities -- confirm)
 *  gpu_p             in      perturbation probability per node and step
 *  gpu_initialState  in/out  one state word per thread
 *  gpu_steps         in      number of steps to simulate
 *  gpu_positiveIndex in      mask of bits that must be 1 (word 0 is used)
 *  gpu_negativeIndex in      mask of bits that must be 0 (word 0 is used)
 *  gpu_stateA        out     per thread: steps whose state matched the masks
 *  gpu_stateB        out     per thread: steps whose state did not match
 *  gpu_transitionsLastChain out  4 ints per thread: counts indexed [previous][current],
 *                            where 1 means "matched the masks"
 *  gpu_bridge        in/out  per thread: classification (0 or 1) of the state before
 *                            the first step; replaced by that of the last step
 *  gpu_stateSize     unused by this kernel
 */
__global__ void kernel(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local

	const int totalF = gpu_cumNf[n]; //number of entries in cij / nv / F
	const int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	// Carve the dynamic shared-memory buffer into the per-block tables. The
	// order and sizes are kept exactly as before so the buffer size chosen at
	// launch still fits.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[1];

	float p = *gpu_p;

	// The first thread of the block fills the shared tables; all threads of
	// the block read them after the barrier below.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalF;
		for (int i = 0; i < totalF; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalF] = totalVarF;
		for (int i = 0; i < totalVarF; i++) {
			varF[i] = gpu_varF[i];
		}
		positiveIndex[0] = gpu_positiveIndex[0];
		negativeIndex[0] = gpu_negativeIndex[0];
	}

	__syncthreads();

	// initialState is the state all functions read during a step,
	// initialStateCopy is the state being built for the next step.
	int initialState = gpu_initialState[idx];
	int initialStateCopy = initialState;

	int steps = *gpu_steps;
	// Work on a local copy of the generator state and write it back at the end.
	curandState_t localState = states[idx];

	int stateA = 0; //steps whose state matched the masks
	int stateB = 0; //steps whose state did not match
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } }; //[previous][current]

	int bridge = gpu_bridge[idx];
	// Start from the incoming classification so that the write-back below
	// stores a defined value even when no step is simulated (steps <= 0).
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//check perturbation: one draw per node, flip the node's bit on a hit
		for (int i = 0; i < n; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy = initialStateCopy ^ (1 << (i % 32));
			}
		}
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				// Pick one of node i's functions: advance while the draw is
				// above the entry's threshold. curand_uniform can return
				// exactly 1.0, so the search is clamped to the node's last
				// entry instead of running into the next node's entries.
				int fIndex = cumNf[i];
				const int lastF = cumNf[i + 1] - 1;
				float draw = curand_uniform(&localState);
				while (fIndex < lastF && draw > cij[fIndex]) {
					fIndex++;
				}

				int elementF = F[fIndex];
				int startVarFIndex = cumNv[fIndex];
				int resultStateSize = cumNv[fIndex + 1] - startVarFIndex;

				// Build the shift amount from the current values of the
				// function's input nodes.
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					if (((initialState >> (varF[startVarFIndex + ind] % 32)) & 1)
							!= 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				// Set bit i of the next state to (elementF & 1).
				initialStateCopy ^= (-(elementF & 1) ^ initialStateCopy)
						& (1 << (i % 32));
			}
		}
		//simulation finished
		//update initialState to the new state
		initialState = initialStateCopy;

		if (checkMetaStateKernel(positiveIndex, negativeIndex, initialState,
				0)) {
			stateA++;
			index1 = 1;
		} else {
			stateB++;
			index1 = 0;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//write the generator state and the final network state back
	states[idx] = localState;
	gpu_initialState[idx] = initialState;

	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	int base = idx * 4;
	gpu_transitionsLastChain[base] = transitions[0][0];
	gpu_transitionsLastChain[base + 1] = transitions[0][1];
	gpu_transitionsLastChain[base + 2] = transitions[1][0];
	gpu_transitionsLastChain[base + 3] = transitions[1][1];

}
/*
 * Helper of kernel2: draws one random number to select a function of node i
 * and returns that function's output bit (0 or 1) for the two-word state
 * (word0 holds nodes 0-31, word1 holds nodes 32-63).
 */
static __device__ __forceinline__ int kernel2NodeValue(int i,
		curandState_t* rng, const float* cij, const int* cumNf,
		const int* cumNv, const int* F, const int* varF, int word0,
		int word1) {
	// Advance while the draw is above the entry's threshold. curand_uniform
	// can return exactly 1.0, so the search is clamped to the node's last
	// entry instead of running into the next node's entries.
	int fIndex = cumNf[i];
	const int lastF = cumNf[i + 1] - 1;
	const float draw = curand_uniform(rng);
	while (fIndex < lastF && draw > cij[fIndex]) {
		fIndex++;
	}

	const int startVarFIndex = cumNv[fIndex];
	const int resultStateSize = cumNv[fIndex + 1] - startVarFIndex;

	// Build the shift amount from the current values of the input nodes.
	int shifNum = 0;
	for (int ind = 0; ind < resultStateSize; ind++) {
		const int var = varF[startVarFIndex + ind];
		int word = var / 32;
		if (word == 0) {
			word = word0;
		} else if (word == 1) {
			word = word1;
		}
		if (((word >> (var % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	//after shifting, the last bit will be the value
	return (F[fIndex] >> shifNum) & 1;
}

/**
 * kernel for n=33-64
 *
 * Same simulation as the single-word kernel, but every thread keeps its state
 * in two 32-bit words: nodes 0-31 in the first word, nodes 32 to n-1 in the
 * second.
 *
 * Every thread simulates its own trajectory for *gpu_steps steps. In each
 * step one uniform random number is drawn per node; a node whose draw is
 * below *gpu_p has its bit flipped (perturbation). Only if no node was
 * perturbed are all nodes updated from the previous state through
 * kernel2NodeValue. After each step the new state is classified with
 * checkMetaStateKernel (both words must match) and the counters are updated.
 *
 * Launch requirements:
 *  - one thread per trajectory; the global thread index is not bounds-checked,
 *    so every per-thread array must cover all launched threads;
 *  - dynamic shared memory large enough for the layout carved out below.
 *
 * Parameters (all device pointers):
 *  states            in/out  per-thread cuRAND state
 *  gpu_n             in      number of nodes
 *  gpu_nf            in      n ints, copied to shared memory (not read again here)
 *  gpu_nv            in      gpu_cumNf[n] ints, copied to shared memory (not read again here)
 *  gpu_cumNf         in      n + 1 offsets; the entries of node i start at gpu_cumNf[i]
 *  gpu_cumNv         in      gpu_cumNf[n] + 1 offsets into gpu_varF, one range per function
 *  gpu_F             in      one int per function, used as a bit table
 *  gpu_varF          in      input node indices of all functions
 *  gpu_cij           in      per-function thresholds compared with the random draw
 *                            (presumably cumulative selection probabilities -- confirm)
 *  gpu_p             in      perturbation probability per node and step
 *  gpu_initialState  in/out  two state words per thread, at [2*idx] and [2*idx+1]
 *  gpu_steps         in      number of steps to simulate
 *  gpu_positiveIndex in      two mask words of bits that must be 1
 *  gpu_negativeIndex in      two mask words of bits that must be 0
 *  gpu_stateA        out     per thread: steps whose state matched the masks
 *  gpu_stateB        out     per thread: steps whose state did not match
 *  gpu_transitionsLastChain out  4 ints per thread: counts indexed [previous][current],
 *                            where 1 means "matched the masks"
 *  gpu_bridge        in/out  per thread: classification (0 or 1) of the state before
 *                            the first step; replaced by that of the last step
 *  gpu_stateSize     unused by this kernel
 */
__global__ void kernel2(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local

	const int totalF = gpu_cumNf[n]; //number of entries in cij / nv / F
	const int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	// Carve the dynamic shared-memory buffer into the per-block tables. The
	// order and sizes are kept exactly as before so the buffer size chosen at
	// launch still fits.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[2];

	float p = *gpu_p;

	// The first thread of the block fills the shared tables; all threads of
	// the block read them after the barrier below.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalF;
		for (int i = 0; i < totalF; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalF] = totalVarF;
		for (int i = 0; i < totalVarF; i++) {
			varF[i] = gpu_varF[i];
		}
		for (int i = 0; i < 2; i++) {
			positiveIndex[i] = gpu_positiveIndex[i];
		}
		for (int i = 0; i < 2; i++) {
			negativeIndex[i] = gpu_negativeIndex[i];
		}
	}

	__syncthreads();

	// initialState* is the state all functions read during a step,
	// initialStateCopy* is the state being built for the next step.
	int stateBase = idx * 2;
	int initialState = gpu_initialState[stateBase];
	int initialState2 = gpu_initialState[stateBase + 1];
	int initialStateCopy = initialState;
	int initialStateCopy2 = initialState2;

	int steps = *gpu_steps;
	// Work on a local copy of the generator state and write it back at the end.
	curandState_t localState = states[idx];

	int stateA = 0; //steps whose state matched the masks
	int stateB = 0; //steps whose state did not match
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } }; //[previous][current]

	int bridge = gpu_bridge[idx];
	// Start from the incoming classification so that the write-back below
	// stores a defined value even when no step is simulated (steps <= 0).
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//check perturbation: one draw per node, flip the node's bit on a hit
		for (int i = 0; i < 32; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy = initialStateCopy ^ (1 << i);
			}
		}
		for (int i = 32; i < n; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy2 = initialStateCopy2 ^ (1 << (i - 32));
			}
		}
		if (!perturbation) {
			// Set bit i of the next state to the selected function's output.
			for (int i = 0; i < 32; i++) {
				int bit = kernel2NodeValue(i, &localState, cij, cumNf, cumNv,
						F, varF, initialState, initialState2);
				initialStateCopy ^= ((-bit) ^ initialStateCopy) & (1 << i);
			}
			for (int i = 32; i < n; i++) {
				int bit = kernel2NodeValue(i, &localState, cij, cumNf, cumNv,
						F, varF, initialState, initialState2);
				initialStateCopy2 ^= ((-bit) ^ initialStateCopy2)
						& (1 << (i - 32));
			}
		}
		//simulation finished
		//update initialState to the new state
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;

		if (!checkMetaStateKernel(positiveIndex, negativeIndex, initialState, 0)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState2, 1)) {
			stateB++;
			index1 = 0;
		} else {
			stateA++;
			index1 = 1;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//write the generator state and the final network state back
	states[idx] = localState;
	gpu_initialState[stateBase] = initialState;
	gpu_initialState[stateBase + 1] = initialState2;

	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	int base = idx * 4;
	gpu_transitionsLastChain[base] = transitions[0][0];
	gpu_transitionsLastChain[base + 1] = transitions[0][1];
	gpu_transitionsLastChain[base + 2] = transitions[1][0];
	gpu_transitionsLastChain[base + 3] = transitions[1][1];

}

/*
 * Helper of kernel3: draws one random number to select a function of node i
 * and returns that function's output bit (0 or 1) for the three-word state
 * (word0 holds nodes 0-31, word1 nodes 32-63, word2 nodes 64-95).
 */
static __device__ __forceinline__ int kernel3NodeValue(int i,
		curandState_t* rng, const float* cij, const int* cumNf,
		const int* cumNv, const int* F, const int* varF, int word0, int word1,
		int word2) {
	// Advance while the draw is above the entry's threshold. curand_uniform
	// can return exactly 1.0, so the search is clamped to the node's last
	// entry instead of running into the next node's entries.
	int fIndex = cumNf[i];
	const int lastF = cumNf[i + 1] - 1;
	const float draw = curand_uniform(rng);
	while (fIndex < lastF && draw > cij[fIndex]) {
		fIndex++;
	}

	const int startVarFIndex = cumNv[fIndex];
	const int resultStateSize = cumNv[fIndex + 1] - startVarFIndex;

	// Build the shift amount from the current values of the input nodes.
	int shifNum = 0;
	for (int ind = 0; ind < resultStateSize; ind++) {
		const int var = varF[startVarFIndex + ind];
		int word = var / 32;
		if (word == 0) {
			word = word0;
		} else if (word == 1) {
			word = word1;
		} else if (word == 2) {
			word = word2;
		}
		if (((word >> (var % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	//after shifting, the last bit will be the value
	return (F[fIndex] >> shifNum) & 1;
}

/**
 * kernel for n=65-96
 *
 * Same simulation as the single-word kernel, but every thread keeps its state
 * in three 32-bit words: nodes 0-31 in the first word, nodes 32-63 in the
 * second and nodes 64 to n-1 in the third.
 *
 * Every thread simulates its own trajectory for *gpu_steps steps. In each
 * step one uniform random number is drawn per node; a node whose draw is
 * below *gpu_p has its bit flipped (perturbation). Only if no node was
 * perturbed are all nodes updated from the previous state through
 * kernel3NodeValue. After each step the new state is classified with
 * checkMetaStateKernel (all three words must match) and the counters are
 * updated.
 *
 * Launch requirements:
 *  - one thread per trajectory; the global thread index is not bounds-checked,
 *    so every per-thread array must cover all launched threads;
 *  - dynamic shared memory large enough for the layout carved out below.
 *
 * Parameters (all device pointers):
 *  states            in/out  per-thread cuRAND state
 *  gpu_n             in      number of nodes
 *  gpu_nf            in      n ints, copied to shared memory (not read again here)
 *  gpu_nv            in      gpu_cumNf[n] ints, copied to shared memory (not read again here)
 *  gpu_cumNf         in      n + 1 offsets; the entries of node i start at gpu_cumNf[i]
 *  gpu_cumNv         in      gpu_cumNf[n] + 1 offsets into gpu_varF, one range per function
 *  gpu_F             in      one int per function, used as a bit table
 *  gpu_varF          in      input node indices of all functions
 *  gpu_cij           in      per-function thresholds compared with the random draw
 *                            (presumably cumulative selection probabilities -- confirm)
 *  gpu_p             in      perturbation probability per node and step
 *  gpu_initialState  in/out  three state words per thread, at [3*idx] to [3*idx+2]
 *  gpu_steps         in      number of steps to simulate
 *  gpu_positiveIndex in      three mask words of bits that must be 1
 *  gpu_negativeIndex in      three mask words of bits that must be 0
 *  gpu_stateA        out     per thread: steps whose state matched the masks
 *  gpu_stateB        out     per thread: steps whose state did not match
 *  gpu_transitionsLastChain out  4 ints per thread: counts indexed [previous][current],
 *                            where 1 means "matched the masks"
 *  gpu_bridge        in/out  per thread: classification (0 or 1) of the state before
 *                            the first step; replaced by that of the last step
 *  gpu_stateSize     unused by this kernel
 */
__global__ void kernel3(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local

	const int totalF = gpu_cumNf[n]; //number of entries in cij / nv / F
	const int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	// Carve the dynamic shared-memory buffer into the per-block tables. The
	// order and sizes are kept exactly as before so the buffer size chosen at
	// launch still fits.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[3];

	float p = *gpu_p;

	// The first thread of the block fills the shared tables; all threads of
	// the block read them after the barrier below.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalF;
		for (int i = 0; i < totalF; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalF] = totalVarF;
		for (int i = 0; i < totalVarF; i++) {
			varF[i] = gpu_varF[i];
		}
		for (int i = 0; i < 3; i++) {
			positiveIndex[i] = gpu_positiveIndex[i];
		}
		for (int i = 0; i < 3; i++) {
			negativeIndex[i] = gpu_negativeIndex[i];
		}
	}

	__syncthreads();

	// initialState* is the state all functions read during a step,
	// initialStateCopy* is the state being built for the next step.
	int stateBase = idx * 3;
	int initialState = gpu_initialState[stateBase];
	int initialState2 = gpu_initialState[stateBase + 1];
	int initialState3 = gpu_initialState[stateBase + 2];
	int initialStateCopy = initialState;
	int initialStateCopy2 = initialState2;
	int initialStateCopy3 = initialState3;

	int steps = *gpu_steps;
	// Work on a local copy of the generator state and write it back at the end.
	curandState_t localState = states[idx];

	int stateA = 0; //steps whose state matched the masks
	int stateB = 0; //steps whose state did not match
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } }; //[previous][current]

	int bridge = gpu_bridge[idx];
	// Start from the incoming classification so that the write-back below
	// stores a defined value even when no step is simulated (steps <= 0).
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//check perturbation: one draw per node, flip the node's bit on a hit
		for (int i = 0; i < 32; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy = initialStateCopy ^ (1 << i);
			}
		}
		for (int i = 32; i < 64; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy2 = initialStateCopy2 ^ (1 << (i - 32));
			}
		}
		for (int i = 64; i < n; i++) {
			float draw = curand_uniform(&localState);
			if (draw < p) {
				perturbation = true;
				initialStateCopy3 = initialStateCopy3 ^ (1 << (i - 64));
			}
		}
		if (!perturbation) {
			// Set bit i of the next state to the selected function's output.
			for (int i = 0; i < 32; i++) {
				int bit = kernel3NodeValue(i, &localState, cij, cumNf, cumNv,
						F, varF, initialState, initialState2, initialState3);
				initialStateCopy ^= ((-bit) ^ initialStateCopy) & (1 << i);
			}
			for (int i = 32; i < 64; i++) {
				int bit = kernel3NodeValue(i, &localState, cij, cumNf, cumNv,
						F, varF, initialState, initialState2, initialState3);
				initialStateCopy2 ^= ((-bit) ^ initialStateCopy2)
						& (1 << (i - 32));
			}
			for (int i = 64; i < n; i++) {
				int bit = kernel3NodeValue(i, &localState, cij, cumNf, cumNv,
						F, varF, initialState, initialState2, initialState3);
				initialStateCopy3 ^= ((-bit) ^ initialStateCopy3)
						& (1 << (i - 64));
			}
		}
		//simulation finished
		//update initialState to the new state
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;
		initialState3 = initialStateCopy3;

		if (!checkMetaStateKernel(positiveIndex, negativeIndex, initialState, 0)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState2, 1)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState3, 2)) {
			stateB++;
			index1 = 0;
		} else {
			stateA++;
			index1 = 1;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//write the generator state and the final network state back
	states[idx] = localState;
	gpu_initialState[stateBase] = initialState;
	gpu_initialState[stateBase + 1] = initialState2;
	gpu_initialState[stateBase + 2] = initialState3;

	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	int base = idx * 4;
	gpu_transitionsLastChain[base] = transitions[0][0];
	gpu_transitionsLastChain[base + 1] = transitions[0][1];
	gpu_transitionsLastChain[base + 2] = transitions[1][0];
	gpu_transitionsLastChain[base + 3] = transitions[1][1];

}

/**
 * kernel for n=97-128
 */
__global__ void kernel4(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local

	//let's make shared memory
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[gpu_cumNf[n]];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[gpu_cumNf[n]];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[gpu_cumNf[n] + 1];
	int* varF = (int*) &F[gpu_cumNf[n]];
	int* positiveIndex = (int*) &varF[gpu_cumNv[gpu_cumNf[n]]];
	int* negativeIndex = (int*) &positiveIndex[4];
	//int* initialState = (int*) &negativeIndex[stateSize];
	//int* initialStateCopy = (int*) &initialState[stateSize * blockDim.x]; //Temporally using blockDim,need to verify

	float p = *gpu_p;
	int initialStateCopy, initialStateCopy2, initialStateCopy3,
			initialStateCopy4;
	int initialState, initialState2, initialState3, initialState4;
	// The first thread in the block does the allocation and initialization
	// and then shares the pointer with all other threads through shared memory,
	// so that access can easily be coalesced.

	if (threadIdx.x == 0) {
		//printf("nF:\n");
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
			//printf("%d\n", nf[i]);
		}
		cumNf[n] = gpu_cumNf[n];
		//printf("nv and F and cij and cumNv:\n");
		for (int i = 0; i < cumNf[n]; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
			//printf("%d %d %f %d \n", nv[i], F[i], cij[i], cumNv[i]);
		}

		cumNv[cumNf[n]] = gpu_cumNv[cumNf[n]];
		//printf("varF:\n");
		for (int i = 0; i < cumNv[cumNf[n]]; i++) {
			varF[i] = gpu_varF[i];
			//printf("%d \n", varF[i]);
		}

		for (int i = 0; i < 4; i++) {
			positiveIndex[i] = gpu_positiveIndex[i];
		}
		for (int i = 0; i < 4; i++) {
			negativeIndex[i] = gpu_negativeIndex[i];
		}
	}

	__syncthreads();

	int relativeIndex = idx * 4;
	initialState = gpu_initialState[relativeIndex];
	initialState2 = gpu_initialState[relativeIndex + 1];
	initialState3 = gpu_initialState[relativeIndex + 2];
	initialState4 = gpu_initialState[relativeIndex + 3];

	initialStateCopy = initialState;
	initialStateCopy2 = initialState2;
	initialStateCopy3 = initialState3;
	initialStateCopy4 = initialState4;

	int steps = *gpu_steps;
	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	//printf("idx %d", idx);
	float rand;
	bool perturbation = false;
	//int nv_size = gpu_cumNf[n];
	int stateA = 0; //how many steps are in state A
	int stateB = 0;
	int transitions[2][2]; //maybe put this in shared memory to speed up
	for (int i = 0; i < 2; i++) {
		for (int j = 0; j < 2; j++) {
			transitions[i][j] = 0;
		}
	}
	int bridge, index1;
	bridge = gpu_bridge[idx];
	for (int j = 0; j < steps; j++) {
		perturbation = false;
		//check perturbation
		for (int i = 0; i < 32; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy = initialStateCopy ^ (1 << i); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 32; i < 64; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy2 = initialStateCopy2 ^ (1 << (i - 32)); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 64; i < 96; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy3 = initialStateCopy3 ^ (1 << (i - 64)); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 96; i < n; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy4 = initialStateCopy4 ^ (1 << (i - 96)); //might use constant memory to replace i/32 and i%32
			}
		}
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy ^= (-(elementF & 1) ^ initialStateCopy)
						& (1 << i);
			}
			for (int i = 32; i < 64; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy2 ^= (-(elementF & 1) ^ initialStateCopy2)
						& (1 << (i - 32));
			}
			for (int i = 64; i < 96; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy3 ^= (-(elementF & 1) ^ initialStateCopy3)
						& (1 << (i - 64));
			}
			for (int i = 96; i < n; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy4 ^= (-(elementF & 1) ^ initialStateCopy4)
						& (1 << (i - 96));
			}
		}
		//simulation finished
		//update initialState to the new state
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;
		initialState3 = initialStateCopy3;
		initialState4 = initialStateCopy4;
		if (!checkMetaStateKernel(positiveIndex, negativeIndex, initialState, 0)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState2, 1)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState3, 2)
				|| !checkMetaStateKernel(positiveIndex, negativeIndex,
						initialState4, 3)) {
			stateB++;
			index1 = 0;

		} else {
			stateA++;
			index1 = 1;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}
	//update state
	states[idx] = localState;
	relativeIndex = 4 * idx;
	gpu_initialState[relativeIndex] = initialStateCopy;
	gpu_initialState[relativeIndex + 1] = initialStateCopy2;
	gpu_initialState[relativeIndex + 2] = initialStateCopy3;
	gpu_initialState[relativeIndex + 3] = initialStateCopy4;
	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	//printf("idx=%d,stateA=%d,stateB=%d\n",idx,stateA,stateB);
	gpu_transitionsLastChain[relativeIndex] = transitions[0][0];
	gpu_transitionsLastChain[relativeIndex + 1] = transitions[0][1];
	gpu_transitionsLastChain[relativeIndex + 2] = transitions[1][0];
	gpu_transitionsLastChain[relativeIndex + 3] = transitions[1][1];
}
/**
 * Two-state simulation kernel for n = 129-160 nodes. One network state is
 * packed into 5 ints per thread: node i lives in bit (i % 32) of word (i / 32).
 *
 * Every thread advances its own trajectory by *gpu_steps steps. In each step
 * every node is flipped independently when a uniform draw is below *gpu_p.
 * Only if no node was flipped, every node is recomputed from one of its
 * Boolean functions; the function is selected by comparing one uniform draw
 * against the node's cij entries (presumably cumulative selection
 * probabilities - confirm against the host code that fills cij). After the
 * step the new state is classified with checkMetaStateKernel and the 2x2
 * transition counters are updated.
 *
 * Launch requirements visible from the code:
 *  - exactly one thread per trajectory (idx is not bounds-checked);
 *  - dynamic shared memory large enough for cij, nf, nv, cumNf, cumNv, F,
 *    varF, positiveIndex and negativeIndex stored back to back in that order.
 *
 * Per-thread outputs:
 *  - states[idx]                       advanced RNG state
 *  - gpu_initialState[5*idx .. +4]     final network state
 *  - gpu_bridge[idx]                   class (0/1) of the last simulated state
 *  - gpu_stateA[idx], gpu_stateB[idx]  number of steps classified 1 / 0
 *  - gpu_transitionsLastChain[4*idx .. +3]  counters [0][0],[0][1],[1][0],[1][1]
 *
 * gpu_stateSize is not read by this kernel.
 */
__global__ void kernel5(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	const int WORDS = 5; //ints used to store one network state
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local
	int totalF = gpu_cumNf[n]; //number of entries in F, cij, nv
	int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	//carve the dynamic shared buffer into the individual tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[WORDS];

	//all threads of the block copy the tables with a block-sized stride
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVarF; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//terminating entries of the two offset tables
		cumNf[n] = totalF;
		cumNv[totalF] = totalVarF;
		for (int i = 0; i < WORDS; i++) {
			positiveIndex[i] = gpu_positiveIndex[i];
			negativeIndex[i] = gpu_negativeIndex[i];
		}
	}

	//shared tables must be complete before any thread starts simulating
	__syncthreads();

	float p = *gpu_p;
	int steps = *gpu_steps;

	//initialState is the state at the start of the step (read only while the
	//step is computed); initialStateCopy receives the new bit values
	int initialState[WORDS];
	int initialStateCopy[WORDS];
	int stateBase = idx * WORDS;
	for (int w = 0; w < WORDS; w++) {
		initialState[w] = gpu_initialState[stateBase + w];
		initialStateCopy[w] = initialState[w];
	}

	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;
	int stateA = 0; //how many steps are in state A
	int stateB = 0;
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } };

	int bridge = gpu_bridge[idx];
	//start from the incoming class so gpu_bridge is preserved when steps == 0
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		//perturbation phase: flip each node with probability p
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			rand = curand_uniform(&localState);
			if (rand < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= 1u << (i % 32);
			}
		}

		//network update phase, only when nothing was perturbed
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				//select one of node i's functions. curand_uniform may
				//return exactly 1.0f, so the scan stops at the node's last
				//function instead of running into the next node's entries
				//when the last cij value is rounded below 1.
				int func = cumNf[i];
				int lastFunc = cumNf[i + 1] - 1;
				rand = curand_uniform(&localState);
				while (func < lastFunc && rand > cij[func]) {
					func++;
				}

				int elementF = F[func]; //truth table of the selected function
				int startVarFIndex = cumNv[func];
				int resultStateSize = cumNv[func + 1] - startVarFIndex;

				//build the truth-table row from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					int var = varF[startVarFIndex + ind];
					if (((initialState[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				int bit = (elementF >> shifNum) & 1; //new value of node i

				//set bit (i % 32) of the node's word to 'bit'
				int word = i / 32;
				unsigned int mask = 1u << (i % 32);
				initialStateCopy[word] ^= (-bit ^ initialStateCopy[word]) & mask;
			}
		}

		//simulation finished, update initialState to the new state
		for (int w = 0; w < WORDS; w++) {
			initialState[w] = initialStateCopy[w];
		}

		//class 1 only if every word passes the check; stop at first failure
		bool allWordsMatch = true;
		for (int w = 0; w < WORDS && allWordsMatch; w++) {
			allWordsMatch = checkMetaStateKernel(positiveIndex, negativeIndex,
					initialState[w], w);
		}
		if (allWordsMatch) {
			stateA++;
			index1 = 1;
		} else {
			stateB++;
			index1 = 0;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//update state
	states[idx] = localState;
	for (int w = 0; w < WORDS; w++) {
		gpu_initialState[stateBase + w] = initialStateCopy[w];
	}
	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	//four counters per thread, so the stride is 4 and not the state-word
	//count. NOTE(review): confirm the host reads this buffer with stride 4.
	int transBase = 4 * idx;
	gpu_transitionsLastChain[transBase] = transitions[0][0];
	gpu_transitionsLastChain[transBase + 1] = transitions[0][1];
	gpu_transitionsLastChain[transBase + 2] = transitions[1][0];
	gpu_transitionsLastChain[transBase + 3] = transitions[1][1];
}
/**
 * Two-state simulation kernel for n = 161-192 nodes. One network state is
 * packed into 6 ints per thread: node i lives in bit (i % 32) of word (i / 32).
 *
 * Every thread advances its own trajectory by *gpu_steps steps. In each step
 * every node is flipped independently when a uniform draw is below *gpu_p.
 * Only if no node was flipped, every node is recomputed from one of its
 * Boolean functions; the function is selected by comparing one uniform draw
 * against the node's cij entries (presumably cumulative selection
 * probabilities - confirm against the host code that fills cij). After the
 * step the new state is classified with checkMetaStateKernel and the 2x2
 * transition counters are updated.
 *
 * Launch requirements visible from the code:
 *  - exactly one thread per trajectory (idx is not bounds-checked);
 *  - dynamic shared memory large enough for cij, nf, nv, cumNf, cumNv, F,
 *    varF, positiveIndex and negativeIndex stored back to back in that order.
 *
 * Per-thread outputs:
 *  - states[idx]                       advanced RNG state
 *  - gpu_initialState[6*idx .. +5]     final network state
 *  - gpu_bridge[idx]                   class (0/1) of the last simulated state
 *  - gpu_stateA[idx], gpu_stateB[idx]  number of steps classified 1 / 0
 *  - gpu_transitionsLastChain[4*idx .. +3]  counters [0][0],[0][1],[1][0],[1][1]
 *
 * gpu_stateSize is not read by this kernel.
 */
__global__ void kernel6(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	const int WORDS = 6; //ints used to store one network state
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local
	int totalF = gpu_cumNf[n]; //number of entries in F, cij, nv
	int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	//carve the dynamic shared buffer into the individual tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[WORDS];

	//all threads of the block copy the tables with a block-sized stride
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVarF; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//terminating entries of the two offset tables
		cumNf[n] = totalF;
		cumNv[totalF] = totalVarF;
		for (int i = 0; i < WORDS; i++) {
			positiveIndex[i] = gpu_positiveIndex[i];
			negativeIndex[i] = gpu_negativeIndex[i];
		}
	}

	//shared tables must be complete before any thread starts simulating
	__syncthreads();

	float p = *gpu_p;
	int steps = *gpu_steps;

	//initialState is the state at the start of the step (read only while the
	//step is computed); initialStateCopy receives the new bit values
	int initialState[WORDS];
	int initialStateCopy[WORDS];
	int stateBase = idx * WORDS;
	for (int w = 0; w < WORDS; w++) {
		initialState[w] = gpu_initialState[stateBase + w];
		initialStateCopy[w] = initialState[w];
	}

	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;
	int stateA = 0; //how many steps are in state A
	int stateB = 0;
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } };

	int bridge = gpu_bridge[idx];
	//start from the incoming class so gpu_bridge is preserved when steps == 0
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		//perturbation phase: flip each node with probability p
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			rand = curand_uniform(&localState);
			if (rand < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= 1u << (i % 32);
			}
		}

		//network update phase, only when nothing was perturbed
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				//select one of node i's functions. curand_uniform may
				//return exactly 1.0f, so the scan stops at the node's last
				//function instead of running into the next node's entries
				//when the last cij value is rounded below 1.
				int func = cumNf[i];
				int lastFunc = cumNf[i + 1] - 1;
				rand = curand_uniform(&localState);
				while (func < lastFunc && rand > cij[func]) {
					func++;
				}

				int elementF = F[func]; //truth table of the selected function
				int startVarFIndex = cumNv[func];
				int resultStateSize = cumNv[func + 1] - startVarFIndex;

				//build the truth-table row from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					int var = varF[startVarFIndex + ind];
					if (((initialState[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				int bit = (elementF >> shifNum) & 1; //new value of node i

				//set bit (i % 32) of the node's word to 'bit'
				int word = i / 32;
				unsigned int mask = 1u << (i % 32);
				initialStateCopy[word] ^= (-bit ^ initialStateCopy[word]) & mask;
			}
		}

		//simulation finished, update initialState to the new state
		for (int w = 0; w < WORDS; w++) {
			initialState[w] = initialStateCopy[w];
		}

		//class 1 only if every word passes the check; stop at first failure
		bool allWordsMatch = true;
		for (int w = 0; w < WORDS && allWordsMatch; w++) {
			allWordsMatch = checkMetaStateKernel(positiveIndex, negativeIndex,
					initialState[w], w);
		}
		if (allWordsMatch) {
			stateA++;
			index1 = 1;
		} else {
			stateB++;
			index1 = 0;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//update state
	states[idx] = localState;
	for (int w = 0; w < WORDS; w++) {
		gpu_initialState[stateBase + w] = initialStateCopy[w];
	}
	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	//four counters per thread, so the stride is 4 and not the state-word
	//count. NOTE(review): confirm the host reads this buffer with stride 4.
	int transBase = 4 * idx;
	gpu_transitionsLastChain[transBase] = transitions[0][0];
	gpu_transitionsLastChain[transBase + 1] = transitions[0][1];
	gpu_transitionsLastChain[transBase + 2] = transitions[1][0];
	gpu_transitionsLastChain[transBase + 3] = transitions[1][1];
}
/**
 * Two-state simulation kernel for n = 193-512 nodes. One network state is
 * packed into *gpu_stateSize ints per thread (at most 16): node i lives in
 * bit (i % 32) of word (i / 32).
 *
 * Every thread advances its own trajectory by *gpu_steps steps. In each step
 * every node is flipped independently when a uniform draw is below *gpu_p.
 * Only if no node was flipped, every node is recomputed from one of its
 * Boolean functions; the function is selected by comparing one uniform draw
 * against the node's cij entries (presumably cumulative selection
 * probabilities - confirm against the host code that fills cij). After the
 * step the new state is classified with checkMetaStateKernel and the 2x2
 * transition counters are updated.
 *
 * Preconditions (not checked here):
 *  - *gpu_stateSize <= 16 and n <= 16 * 32, the capacity of the per-thread
 *    state arrays;
 *  - exactly one thread per trajectory (idx is not bounds-checked);
 *  - dynamic shared memory large enough for cij, nf, nv, cumNf, cumNv, F,
 *    varF, positiveIndex and negativeIndex stored back to back in that order.
 *
 * Per-thread outputs:
 *  - states[idx]                                  advanced RNG state
 *  - gpu_initialState[stateSize*idx .. +stateSize-1]  final network state
 *  - gpu_bridge[idx]                   class (0/1) of the last simulated state
 *  - gpu_stateA[idx], gpu_stateB[idx]  number of steps classified 1 / 0
 *  - gpu_transitionsLastChain[4*idx .. +3]  counters [0][0],[0][1],[1][0],[1][1]
 */
__global__ void kernel7(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	const int MAX_WORDS = 16; //capacity of the per-thread state arrays
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local
	int stateSize = *gpu_stateSize; //ints actually used per state
	int totalF = gpu_cumNf[n]; //number of entries in F, cij, nv
	int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	//carve the dynamic shared buffer into the individual tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[stateSize];

	//all threads of the block copy the tables with a block-sized stride
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVarF; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	for (int i = threadIdx.x; i < stateSize; i += blockDim.x) {
		positiveIndex[i] = gpu_positiveIndex[i];
		negativeIndex[i] = gpu_negativeIndex[i];
	}
	if (threadIdx.x == 0) {
		//terminating entries of the two offset tables
		cumNf[n] = totalF;
		cumNv[totalF] = totalVarF;
	}

	//shared tables must be complete before any thread starts simulating
	__syncthreads();

	float p = *gpu_p;
	int steps = *gpu_steps;

	//initialState is the state at the start of the step (read only while the
	//step is computed); initialStateCopy receives the new bit values
	int initialState[MAX_WORDS];
	int initialStateCopy[MAX_WORDS];
	int stateBase = idx * stateSize;
	for (int w = 0; w < stateSize; w++) {
		initialState[w] = gpu_initialState[stateBase + w];
		initialStateCopy[w] = initialState[w];
	}

	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;
	int stateA = 0; //how many steps are in state A
	int stateB = 0;
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } };

	int bridge = gpu_bridge[idx];
	//start from the incoming class so gpu_bridge is preserved when steps == 0
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		//perturbation phase: flip each node with probability p
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			rand = curand_uniform(&localState);
			if (rand < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= 1u << (i % 32);
			}
		}

		//network update phase, only when nothing was perturbed
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				//select one of node i's functions. curand_uniform may
				//return exactly 1.0f, so the scan stops at the node's last
				//function instead of running into the next node's entries
				//when the last cij value is rounded below 1.
				int func = cumNf[i];
				int lastFunc = cumNf[i + 1] - 1;
				rand = curand_uniform(&localState);
				while (func < lastFunc && rand > cij[func]) {
					func++;
				}

				int elementF = F[func]; //truth table of the selected function
				int startVarFIndex = cumNv[func];
				int resultStateSize = cumNv[func + 1] - startVarFIndex;

				//build the truth-table row from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					int var = varF[startVarFIndex + ind];
					if (((initialState[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				int bit = (elementF >> shifNum) & 1; //new value of node i

				//set bit (i % 32) of the node's word to 'bit'
				int word = i / 32;
				unsigned int mask = 1u << (i % 32);
				initialStateCopy[word] ^= (-bit ^ initialStateCopy[word]) & mask;
			}
		}

		//simulation finished, update initialState to the new state
		for (int w = 0; w < stateSize; w++) {
			initialState[w] = initialStateCopy[w];
		}

		//class 1 only if every word passes the check; stop at first failure
		bool allWordsMatch = true;
		for (int w = 0; w < stateSize && allWordsMatch; w++) {
			allWordsMatch = checkMetaStateKernel(positiveIndex, negativeIndex,
					initialState[w], w);
		}
		if (allWordsMatch) {
			stateA++;
			index1 = 1;
		} else {
			stateB++;
			index1 = 0;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//update state
	states[idx] = localState;
	for (int w = 0; w < stateSize; w++) {
		gpu_initialState[stateBase + w] = initialStateCopy[w];
	}
	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	//four counters per thread, so the stride is 4 and not the state-word
	//count. NOTE(review): confirm the host reads this buffer with stride 4.
	int transBase = 4 * idx;
	gpu_transitionsLastChain[transBase] = transitions[0][0];
	gpu_transitionsLastChain[transBase + 1] = transitions[0][1];
	gpu_transitionsLastChain[transBase + 2] = transitions[1][0];
	gpu_transitionsLastChain[transBase + 3] = transitions[1][1];
}
/**
 * Two-state simulation kernel for n = 513-2048 nodes. One network state is
 * packed into *gpu_stateSize ints per thread (at most 64): node i lives in
 * bit (i % 32) of word (i / 32).
 *
 * Every thread advances its own trajectory by *gpu_steps steps. In each step
 * every node is flipped independently when a uniform draw is below *gpu_p.
 * Only if no node was flipped, every node is recomputed from one of its
 * Boolean functions; the function is selected by comparing one uniform draw
 * against the node's cij entries (presumably cumulative selection
 * probabilities - confirm against the host code that fills cij). After the
 * step the new state is classified with checkMetaStateKernel and the 2x2
 * transition counters are updated.
 *
 * Preconditions (not checked here):
 *  - *gpu_stateSize <= 64 and n <= 64 * 32, the capacity of the per-thread
 *    state arrays;
 *  - exactly one thread per trajectory (idx is not bounds-checked);
 *  - dynamic shared memory large enough for cij, nf, nv, cumNf, cumNv, F,
 *    varF, positiveIndex and negativeIndex stored back to back in that order.
 *
 * Per-thread outputs:
 *  - states[idx]                                  advanced RNG state
 *  - gpu_initialState[stateSize*idx .. +stateSize-1]  final network state
 *  - gpu_bridge[idx]                   class (0/1) of the last simulated state
 *  - gpu_stateA[idx], gpu_stateB[idx]  number of steps classified 1 / 0
 *  - gpu_transitionsLastChain[4*idx .. +3]  counters [0][0],[0][1],[1][0],[1][1]
 */
__global__ void kernel8(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, int* gpu_steps,
		int* gpu_positiveIndex, int* gpu_negativeIndex, long* gpu_stateA,
		long* gpu_stateB, int* gpu_transitionsLastChain, int* gpu_bridge,
		int* gpu_stateSize) {
	const int MAX_WORDS = 64; //capacity of the per-thread state arrays
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local
	int stateSize = *gpu_stateSize; //ints actually used per state
	int totalF = gpu_cumNf[n]; //number of entries in F, cij, nv
	int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	//carve the dynamic shared buffer into the individual tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];
	int* positiveIndex = (int*) &varF[totalVarF];
	int* negativeIndex = (int*) &positiveIndex[stateSize];

	//all threads of the block copy the tables with a block-sized stride
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVarF; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	for (int i = threadIdx.x; i < stateSize; i += blockDim.x) {
		positiveIndex[i] = gpu_positiveIndex[i];
		negativeIndex[i] = gpu_negativeIndex[i];
	}
	if (threadIdx.x == 0) {
		//terminating entries of the two offset tables
		cumNf[n] = totalF;
		cumNv[totalF] = totalVarF;
	}

	//shared tables must be complete before any thread starts simulating
	__syncthreads();

	float p = *gpu_p;
	int steps = *gpu_steps;

	//initialState is the state at the start of the step (read only while the
	//step is computed); initialStateCopy receives the new bit values
	int initialState[MAX_WORDS];
	int initialStateCopy[MAX_WORDS];
	int stateBase = idx * stateSize;
	for (int w = 0; w < stateSize; w++) {
		initialState[w] = gpu_initialState[stateBase + w];
		initialStateCopy[w] = initialState[w];
	}

	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;
	int stateA = 0; //how many steps are in state A
	int stateB = 0;
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } };

	int bridge = gpu_bridge[idx];
	//start from the incoming class so gpu_bridge is preserved when steps == 0
	int index1 = bridge;

	for (int j = 0; j < steps; j++) {
		//perturbation phase: flip each node with probability p
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			rand = curand_uniform(&localState);
			if (rand < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= 1u << (i % 32);
			}
		}

		//network update phase, only when nothing was perturbed
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				//select one of node i's functions. curand_uniform may
				//return exactly 1.0f, so the scan stops at the node's last
				//function instead of running into the next node's entries
				//when the last cij value is rounded below 1.
				int func = cumNf[i];
				int lastFunc = cumNf[i + 1] - 1;
				rand = curand_uniform(&localState);
				while (func < lastFunc && rand > cij[func]) {
					func++;
				}

				int elementF = F[func]; //truth table of the selected function
				int startVarFIndex = cumNv[func];
				int resultStateSize = cumNv[func + 1] - startVarFIndex;

				//build the truth-table row from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					int var = varF[startVarFIndex + ind];
					if (((initialState[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				int bit = (elementF >> shifNum) & 1; //new value of node i

				//set bit (i % 32) of the node's word to 'bit'
				int word = i / 32;
				unsigned int mask = 1u << (i % 32);
				initialStateCopy[word] ^= (-bit ^ initialStateCopy[word]) & mask;
			}
		}

		//simulation finished, update initialState to the new state
		for (int w = 0; w < stateSize; w++) {
			initialState[w] = initialStateCopy[w];
		}

		//class 1 only if every word passes the check; stop at first failure
		bool allWordsMatch = true;
		for (int w = 0; w < stateSize && allWordsMatch; w++) {
			allWordsMatch = checkMetaStateKernel(positiveIndex, negativeIndex,
					initialState[w], w);
		}
		if (allWordsMatch) {
			stateA++;
			index1 = 1;
		} else {
			stateB++;
			index1 = 0;
		}
		transitions[bridge][index1]++;
		//need to update bridge for next time usage
		bridge = index1;
	}

	//update state
	states[idx] = localState;
	for (int w = 0; w < stateSize; w++) {
		gpu_initialState[stateBase + w] = initialStateCopy[w];
	}
	//copy local data to global data
	gpu_bridge[idx] = index1;
	gpu_stateA[idx] = stateA;
	gpu_stateB[idx] = stateB;

	//four counters per thread, so the stride is 4 and not the state-word
	//count. NOTE(review): confirm the host reads this buffer with stride 4.
	int transBase = 4 * idx;
	gpu_transitionsLastChain[transBase] = transitions[0][0];
	gpu_transitionsLastChain[transBase + 1] = transitions[0][1];
	gpu_transitionsLastChain[transBase + 2] = transitions[1][0];
	gpu_transitionsLastChain[transBase + 3] = transitions[1][1];
}
/*
 * Convergence run for networks whose whole state fits in one int per thread
 * (node i is bit i % 32 of that int).
 *
 * Every thread simulates *gpu_steps steps of its own trajectory. In each step
 * every node is flipped independently when a uniform draw is below *gpu_p;
 * only if no node was flipped, every node is recomputed from one of its
 * Boolean functions, selected by comparing one uniform draw against the
 * node's cij entries.
 *
 * With T = gridDim.x * blockDim.x, step j of thread idx writes
 *  - gpu_trajectoryKernel[idx + j * (*gpu_stateSize) * T]  the raw state word
 *  - gpu_trajectory[idx + j * T]   the state read as unsigned, as a float
 * After the loop the thread writes
 *  - gpu_mean[2*idx]       mean of its gpu_trajectory values
 *  - gpu_mean[2*idx + 1]   their sample variance (divides by steps - 1, so
 *                          the value is not meaningful for steps < 2)
 *  - states[idx], gpu_initialState[idx]  RNG state and final network state
 *
 * Launch requirements visible from the code: one thread per trajectory (idx
 * is not bounds-checked) and dynamic shared memory large enough for cij, nf,
 * nv, cumNf, cumNv, F and varF stored back to back in that order.
 */

__global__ void kernelConverge(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local
	int totalF = gpu_cumNf[n]; //number of entries in F, cij, nv
	int totalVarF = gpu_cumNv[totalF]; //number of entries in varF

	//carve the dynamic shared buffer into the individual tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalF];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalF + 1];
	int* varF = (int*) &F[totalF];

	//all threads of the block copy the tables with a block-sized stride
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVarF; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//terminating entries of the two offset tables
		cumNf[n] = totalF;
		cumNv[totalF] = totalVarF;
	}

	//shared tables must be complete before any thread starts simulating
	__syncthreads();

	float p = *gpu_p;
	int steps = *gpu_steps;
	//initialState is the state at the start of the step (read only while the
	//step is computed); initialStateCopy receives the new bit values
	int initialState = gpu_initialState[idx];
	int initialStateCopy = initialState;
	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;

	int prefix = gridDim.x * blockDim.x; //threads in the grid
	int prefix2 = *gpu_stateSize * prefix; //per-step stride of gpu_trajectoryKernel

	float mean = 0;

	for (int j = 0; j < steps; j++) {
		//perturbation phase: flip each node with probability p
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			rand = curand_uniform(&localState);
			if (rand < p) {
				perturbation = true;
				//only initialStateCopy is modified here
				initialStateCopy ^= 1u << (i % 32);
			}
		}

		//network update phase, only when nothing was perturbed
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				//select one of node i's functions. curand_uniform may
				//return exactly 1.0f, so the scan stops at the node's last
				//function instead of running into the next node's entries
				//when the last cij value is rounded below 1.
				int func = cumNf[i];
				int lastFunc = cumNf[i + 1] - 1;
				rand = curand_uniform(&localState);
				while (func < lastFunc && rand > cij[func]) {
					func++;
				}

				int elementF = F[func]; //truth table of the selected function
				int startVarFIndex = cumNv[func];
				int resultStateSize = cumNv[func + 1] - startVarFIndex;

				//build the truth-table row from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					if (((initialState >> (varF[startVarFIndex + ind] % 32)) & 1)
							!= 0) {
						shifNum += powNum[1][ind];
					}
				}
				int bit = (elementF >> shifNum) & 1; //new value of node i

				//set bit (i % 32) of the state word to 'bit'
				unsigned int mask = 1u << (i % 32);
				initialStateCopy ^= (-bit ^ initialStateCopy) & mask;
			}
		}

		//simulation finished, update initialState to the new state
		initialState = initialStateCopy;
		gpu_trajectoryKernel[idx + j * prefix2] = initialStateCopy;

		//record the state, read as an unsigned number, for the statistics
		float result = (float) ((unsigned int) initialState);
		mean += result;
		gpu_trajectory[idx + j * prefix] = result;
	}

	mean = mean / steps;
	gpu_mean[idx * 2] = mean;

	//second pass over this thread's own trajectory values for the variance
	float minu;
	float variance = 0;
	for (int i = 0; i < steps; i++) {
		minu = gpu_trajectory[idx + i * prefix] - mean;
		variance += minu * minu;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	//update state
	states[idx] = localState;
	gpu_initialState[idx] = initialState;
}
/*
 * run converge for n=33-64
 * gpu_currentTrajectorySize the current trajectory size, initially it is 0
 * gpu_steps how many steps to be simulated in this call*/

__global__ void kernelConverge2(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	int n = *gpu_n; //make variables local

	//let's make shared memory
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[gpu_cumNf[n]];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[gpu_cumNf[n]];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[gpu_cumNf[n] + 1];
	int* varF = (int*) &F[gpu_cumNf[n]];
	//int* positiveIndex = (int*) &varF[gpu_cumNv[gpu_cumNf[n]]];
	//int* negativeIndex = (int*) &positiveIndex[*gpu_stateSize];
	//int* initialState = (int*) &negativeIndex[*gpu_stateSize];
	//int* initialStateCopy = (int*) &initialState[*gpu_stateSize * blockDim.x]; //Temporally using blockDim,need to verify

	float p = *gpu_p;
	int initialStateCopy, initialStateCopy2, initialState, initialState2;
	//int offset = *gpu_stateSize * blockDim.x * blockIdx.x;

	//printf("Finish outputting initial states!\n");
	// The first thread in the block does the allocation and initialization
	// and then shares the pointer with all other threads through shared memory,
	// so that access can easily be coalesced.

	if (threadIdx.x == 0) {
		//printf("nF:\n");
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = gpu_cumNf[n];
		for (int i = 0; i < cumNf[n]; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}

		cumNv[cumNf[n]] = gpu_cumNv[cumNf[n]];
		//printf("varF:\n");
		for (int i = 0; i < cumNv[cumNf[n]]; i++) {
			varF[i] = gpu_varF[i];
			//printf("%d \n", varF[i]);
		}
	}

	__syncthreads();

	int relativeIndex = idx * 2;
	initialState = gpu_initialState[relativeIndex];
	initialState2 = gpu_initialState[relativeIndex + 1];
	initialStateCopy = initialState;
	initialStateCopy2 = initialState2;
	int steps = *gpu_steps;
	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;

	bool perturbation = false;
	//offset = idx * *gpu_stateSize;
	int prefix = gridDim.x * blockDim.x;
	int offset2 = -prefix;//gpu_currentTrajectorySize is the current trajectory size
	int prefix2 = *gpu_stateSize * prefix;
	int offset4 = relativeIndex - prefix2;

	float result = 0;
	float mean = 0;

	for (int j = 0; j < steps; j++) {
		result = 0;
		perturbation = false;
		offset2 += prefix;
		offset4 += prefix2;
		//check perturbation
		for (int i = 0; i < 32; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy = initialStateCopy ^ (1 << i); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 32; i < n; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy2 = initialStateCopy2 ^ (1 << (i - 32)); //might use constant memory to replace i/32 and i%32
			}
		}
		if (false&&!perturbation) {
			//printf("in kernelConverg2, perturbation=false, execute.\n");
			int elementF, startVarFIndex, resultStateSize, shifNum;
			for (int i = 0; i < 32; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				elementF = F[cumNf[i] + relativeIndex];
				startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy ^= (-(elementF & 1) ^ initialStateCopy)
						& (1 << (i));
			}
			for (int i = 32; i < n; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				elementF = F[cumNf[i] + relativeIndex];
				startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy2 ^= (-(elementF & 1) ^ initialStateCopy2)
						& (1 << (i - 32));
			}
		}
		//simulation finished
		//update initialState to the new state
		float times = 1;

		initialState = initialStateCopy;
		gpu_trajectoryKernel[offset4] = initialStateCopy;
		result += ((unsigned int) initialState) * times;
		times *= (float) powNum[1][31] * 2;

		initialState2 = initialStateCopy2;
		gpu_trajectoryKernel[offset4 + 1] = initialStateCopy2;
		result += ((unsigned int) initialState2) * times;
		times *= (float) powNum[1][31] * 2;

		mean += result;
		gpu_trajectory[idx + offset2] = result;
	}

	mean = mean / steps;
	gpu_mean[idx * 2] = mean;
	offset2 = -prefix;
	float minu;
	float variance = 0;
	for (int i = 0; i < steps; i++) {
		offset2 += prefix;
		minu = gpu_trajectory[idx + offset2] - mean;
		variance += minu * minu;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;
	//update state
	states[idx] = localState;
	relativeIndex = *gpu_stateSize * idx;
	gpu_initialState[relativeIndex] = initialState;
	gpu_initialState[relativeIndex + 1] = initialState2;
	//printf("idx=%d", idx);
}
/**
 * Helper for kernelConverge3: choose one Boolean function of `node` and
 * evaluate it on the current three-word state.
 *
 * cij/cumNf/cumNv/F/varF are the staged network tables. The functions of
 * `node` occupy indices cumNf[node] .. cumNf[node + 1] - 1, and the inputs of
 * function f occupy varF[cumNv[f]] .. varF[cumNv[f + 1] - 1].
 *
 * draw   uniform random number used to select the function: the first
 *        function whose cij entry is not exceeded by `draw` is taken
 *        (presumably cij holds per-node cumulative probabilities -- confirm
 *        against the host code that fills it).
 * word0..word2  current state, 32 nodes per word (node v is bit v % 32 of
 *        word v / 32).
 *
 * Returns the selected function's output bit (0 or 1).
 */
__device__ __forceinline__ unsigned int converge3NextValue(const float* cij,
		const int* cumNf, const int* cumNv, const int* F, const int* varF,
		int node, float draw, int word0, int word1, int word2) {
	// Clamp the scan to this node's last function. curand_uniform may return
	// exactly 1.0, so a last threshold that rounds slightly below 1.0 would
	// otherwise let the scan run into the next node's functions (or past the
	// end of the arrays for the last node).
	int func = cumNf[node];
	const int lastFunc = cumNf[node + 1] - 1;
	while (func < lastFunc && draw > cij[func]) {
		func++;
	}

	const int firstVar = cumNv[func];
	const int numVars = cumNv[func + 1] - firstVar;

	// Build the truth-table index from the input bits of the current state.
	int shifNum = 0;
	for (int ind = 0; ind < numVars; ind++) {
		const int var = varF[firstVar + ind];
		const int wordIndex = var / 32;
		int word;
		if (wordIndex == 0) {
			word = word0;
		} else if (wordIndex == 1) {
			word = word1;
		} else {
			word = word2; // variables 64..95
		}
		if (((word >> (var % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	// After shifting, the lowest bit of the truth table is the output value.
	return (unsigned int) ((F[func] >> shifNum) & 1);
}

/**
 * for n=65-96
 *
 * Each thread simulates one trajectory of *gpu_steps steps, keeping its state
 * in three 32-bit words. Per step every node is flipped with probability
 * *gpu_p; only if no node was flipped, every node is updated synchronously by
 * one randomly selected function evaluated on the previous state.
 *
 * Launch requirements (no bounds check is performed on idx):
 *  - 1D grid/block; states, gpu_initialState (3 ints per thread) and gpu_mean
 *    (2 floats per thread) must have an entry for every launched thread.
 *  - dynamic shared memory of at least
 *    (4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]) 4-byte words.
 *
 * Outputs:
 *  - gpu_trajectoryKernel[3 * idx + j * (*gpu_stateSize) * T + w]: word w of
 *    the state after step j (T = gridDim.x * blockDim.x).
 *  - gpu_trajectory[idx + j * T]: the state after step j folded into a float.
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and sample variance
 *    (divisor steps - 1, so a single step yields NaN) of those floats.
 *  - states[idx] and gpu_initialState are updated so a later launch continues
 *    the same trajectory.
 */
__global__ void kernelConverge3(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int numNodes = *gpu_n;
	const int totalF = gpu_cumNf[numNodes];  // total number of functions
	const int totalV = gpu_cumNv[totalF];    // total number of function inputs

	// Carve the dynamic shared buffer into the network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = &nf[numNodes];
	int* cumNf = &nv[totalF];
	int* cumNv = &cumNf[numNodes + 1];
	int* F = &cumNv[totalF + 1];
	int* varF = &F[totalF];

	// Stage the tables cooperatively: every thread of the block copies a
	// strided share, then the block synchronizes before anyone reads them.
	for (int i = threadIdx.x; i < numNodes; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalV; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		cumNf[numNodes] = totalF;
		cumNv[totalF] = totalV;
	}
	__syncthreads();

	const float perturbProb = *gpu_p;
	const int steps = *gpu_steps;
	const int totalThreads = gridDim.x * blockDim.x;
	const int kernelStride = *gpu_stateSize * totalThreads;

	// cur* is the state the functions read; next* is the state being built.
	int cur0 = gpu_initialState[idx * 3];
	int cur1 = gpu_initialState[idx * 3 + 1];
	int cur2 = gpu_initialState[idx * 3 + 2];
	int next0 = cur0;
	int next1 = cur1;
	int next2 = cur2;

	// Work on a private copy of the generator state.
	curandState_t localState = states[idx];

	// Weight between consecutive state words when folding them into a float.
	// NOTE(review): if powNum[1][31] is stored as a signed 1 << 31 this factor
	// is negative -- confirm against the host initialisation of powNum.
	const float wordScale = (float) powNum[1][31] * 2;

	int trajSlot = idx;           // index into gpu_trajectory for this step
	int kernelSlot = idx * 3;     // index into gpu_trajectoryKernel for this step
	float mean = 0;
	float draw;
	unsigned int value;

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, in node order.
		bool perturbation = false;
		for (int i = 0; i < 32; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next0 ^= (1u << i);
			}
		}
		for (int i = 32; i < 64; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next1 ^= (1u << (i - 32));
			}
		}
		for (int i = 64; i < numNodes; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next2 ^= (1u << (i - 64));
			}
		}

		// Network transition, only when nothing was perturbed.
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				draw = curand_uniform(&localState);
				value = converge3NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2);
				next0 = (next0 & ~(1u << i)) | (value << i);
			}
			for (int i = 32; i < 64; i++) {
				draw = curand_uniform(&localState);
				value = converge3NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2);
				next1 = (next1 & ~(1u << (i - 32))) | (value << (i - 32));
			}
			for (int i = 64; i < numNodes; i++) {
				draw = curand_uniform(&localState);
				value = converge3NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2);
				next2 = (next2 & ~(1u << (i - 64))) | (value << (i - 64));
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;

		cur0 = next0;
		gpu_trajectoryKernel[kernelSlot] = next0;
		result += ((unsigned int) cur0) * times;
		times *= wordScale;

		cur1 = next1;
		gpu_trajectoryKernel[kernelSlot + 1] = next1;
		result += ((unsigned int) cur1) * times;
		times *= wordScale;

		cur2 = next2;
		gpu_trajectoryKernel[kernelSlot + 2] = next2;
		result += ((unsigned int) cur2) * times;

		mean += result;
		gpu_trajectory[trajSlot] = result;

		trajSlot += totalThreads;
		kernelSlot += kernelStride;
	}

	// Mean and sample variance of this thread's recorded values.
	mean = mean / steps;
	gpu_mean[idx * 2] = mean;
	float variance = 0;
	trajSlot = idx;
	for (int j = 0; j < steps; j++) {
		const float minu = gpu_trajectory[trajSlot] - mean;
		variance += minu * minu;
		trajSlot += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist generator and network state for the next launch.
	states[idx] = localState;
	gpu_initialState[idx * 3] = cur0;
	gpu_initialState[idx * 3 + 1] = cur1;
	gpu_initialState[idx * 3 + 2] = cur2;
}
/**
 * Helper for kernelConverge4: choose one Boolean function of `node` and
 * evaluate it on the current four-word state.
 *
 * cij/cumNf/cumNv/F/varF are the staged network tables. The functions of
 * `node` occupy indices cumNf[node] .. cumNf[node + 1] - 1, and the inputs of
 * function f occupy varF[cumNv[f]] .. varF[cumNv[f + 1] - 1].
 *
 * draw   uniform random number used to select the function: the first
 *        function whose cij entry is not exceeded by `draw` is taken
 *        (presumably cij holds per-node cumulative probabilities -- confirm
 *        against the host code that fills it).
 * word0..word3  current state, 32 nodes per word (node v is bit v % 32 of
 *        word v / 32).
 *
 * Returns the selected function's output bit (0 or 1).
 */
__device__ __forceinline__ unsigned int converge4NextValue(const float* cij,
		const int* cumNf, const int* cumNv, const int* F, const int* varF,
		int node, float draw, int word0, int word1, int word2, int word3) {
	// Clamp the scan to this node's last function. curand_uniform may return
	// exactly 1.0, so a last threshold that rounds slightly below 1.0 would
	// otherwise let the scan run into the next node's functions (or past the
	// end of the arrays for the last node).
	int func = cumNf[node];
	const int lastFunc = cumNf[node + 1] - 1;
	while (func < lastFunc && draw > cij[func]) {
		func++;
	}

	const int firstVar = cumNv[func];
	const int numVars = cumNv[func + 1] - firstVar;

	// Build the truth-table index from the input bits of the current state.
	int shifNum = 0;
	for (int ind = 0; ind < numVars; ind++) {
		const int var = varF[firstVar + ind];
		const int wordIndex = var / 32;
		int word;
		if (wordIndex == 0) {
			word = word0;
		} else if (wordIndex == 1) {
			word = word1;
		} else if (wordIndex == 2) {
			word = word2;
		} else {
			word = word3; // variables 96..127
		}
		if (((word >> (var % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	// After shifting, the lowest bit of the truth table is the output value.
	return (unsigned int) ((F[func] >> shifNum) & 1);
}

/**
 * for n=97-128
 *
 * Each thread simulates one trajectory of *gpu_steps steps, keeping its state
 * in four 32-bit words. Per step every node is flipped with probability
 * *gpu_p; only if no node was flipped, every node is updated synchronously by
 * one randomly selected function evaluated on the previous state.
 *
 * Launch requirements (no bounds check is performed on idx):
 *  - 1D grid/block; states, gpu_initialState (4 ints per thread) and gpu_mean
 *    (2 floats per thread) must have an entry for every launched thread.
 *  - dynamic shared memory of at least
 *    (4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]) 4-byte words.
 *
 * Outputs:
 *  - gpu_trajectoryKernel[4 * idx + j * 4 * T + w]: word w of the state after
 *    step j (T = gridDim.x * blockDim.x).
 *  - gpu_trajectory[idx + j * T]: the state after step j folded into a float.
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and sample variance
 *    (divisor steps - 1, so a single step yields NaN) of those floats.
 *  - states[idx] and gpu_initialState are updated so a later launch continues
 *    the same trajectory.
 *
 * gpu_stateSize is accepted for signature parity with the sibling kernels but
 * is not read here; the word count is fixed at 4.
 */
__global__ void kernelConverge4(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int numNodes = *gpu_n;
	const int totalF = gpu_cumNf[numNodes];  // total number of functions
	const int totalV = gpu_cumNv[totalF];    // total number of function inputs

	// Carve the dynamic shared buffer into the network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = &nf[numNodes];
	int* cumNf = &nv[totalF];
	int* cumNv = &cumNf[numNodes + 1];
	int* F = &cumNv[totalF + 1];
	int* varF = &F[totalF];

	// Stage the tables cooperatively: every thread of the block copies a
	// strided share, then the block synchronizes before anyone reads them.
	for (int i = threadIdx.x; i < numNodes; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalV; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		cumNf[numNodes] = totalF;
		cumNv[totalF] = totalV;
	}
	__syncthreads();

	const float perturbProb = *gpu_p;
	const int steps = *gpu_steps;
	const int totalThreads = gridDim.x * blockDim.x;
	const int kernelStride = 4 * totalThreads;

	// cur* is the state the functions read; next* is the state being built.
	int cur0 = gpu_initialState[idx * 4];
	int cur1 = gpu_initialState[idx * 4 + 1];
	int cur2 = gpu_initialState[idx * 4 + 2];
	int cur3 = gpu_initialState[idx * 4 + 3];
	int next0 = cur0;
	int next1 = cur1;
	int next2 = cur2;
	int next3 = cur3;

	// Work on a private copy of the generator state.
	curandState_t localState = states[idx];

	// Weight between consecutive state words when folding them into a float.
	// NOTE(review): if powNum[1][31] is stored as a signed 1 << 31 this factor
	// is negative -- confirm against the host initialisation of powNum.
	const float wordScale = (float) powNum[1][31] * 2;

	int trajSlot = idx;           // index into gpu_trajectory for this step
	int kernelSlot = idx * 4;     // index into gpu_trajectoryKernel for this step
	float mean = 0;
	float draw;
	unsigned int value;

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, in node order.
		bool perturbation = false;
		for (int i = 0; i < 32; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next0 ^= (1u << i);
			}
		}
		for (int i = 32; i < 64; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next1 ^= (1u << (i - 32));
			}
		}
		for (int i = 64; i < 96; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next2 ^= (1u << (i - 64));
			}
		}
		for (int i = 96; i < numNodes; i++) {
			draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next3 ^= (1u << (i - 96));
			}
		}

		// Network transition, only when nothing was perturbed.
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				draw = curand_uniform(&localState);
				value = converge4NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2, cur3);
				next0 = (next0 & ~(1u << i)) | (value << i);
			}
			for (int i = 32; i < 64; i++) {
				draw = curand_uniform(&localState);
				value = converge4NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2, cur3);
				next1 = (next1 & ~(1u << (i - 32))) | (value << (i - 32));
			}
			for (int i = 64; i < 96; i++) {
				draw = curand_uniform(&localState);
				value = converge4NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2, cur3);
				next2 = (next2 & ~(1u << (i - 64))) | (value << (i - 64));
			}
			for (int i = 96; i < numNodes; i++) {
				draw = curand_uniform(&localState);
				value = converge4NextValue(cij, cumNf, cumNv, F, varF, i, draw,
						cur0, cur1, cur2, cur3);
				next3 = (next3 & ~(1u << (i - 96))) | (value << (i - 96));
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;

		cur0 = next0;
		gpu_trajectoryKernel[kernelSlot] = next0;
		result += ((unsigned int) cur0) * times;
		times *= wordScale;

		cur1 = next1;
		gpu_trajectoryKernel[kernelSlot + 1] = next1;
		result += ((unsigned int) cur1) * times;
		times *= wordScale;

		cur2 = next2;
		gpu_trajectoryKernel[kernelSlot + 2] = next2;
		result += ((unsigned int) cur2) * times;
		times *= wordScale;

		cur3 = next3;
		gpu_trajectoryKernel[kernelSlot + 3] = next3;
		result += ((unsigned int) cur3) * times;

		mean += result;
		gpu_trajectory[trajSlot] = result;

		trajSlot += totalThreads;
		kernelSlot += kernelStride;
	}

	// Mean and sample variance of this thread's recorded values.
	mean = mean / steps;
	gpu_mean[idx * 2] = mean;
	float variance = 0;
	trajSlot = idx;
	for (int j = 0; j < steps; j++) {
		const float minu = gpu_trajectory[trajSlot] - mean;
		variance += minu * minu;
		trajSlot += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist generator and network state for the next launch.
	states[idx] = localState;
	gpu_initialState[idx * 4] = cur0;
	gpu_initialState[idx * 4 + 1] = cur1;
	gpu_initialState[idx * 4 + 2] = cur2;
	gpu_initialState[idx * 4 + 3] = cur3;
}
/**
 * for n=129-160
 *
 * Each thread simulates one trajectory of *gpu_steps steps, keeping its state
 * in five 32-bit words (node v is bit v % 32 of word v / 32). Per step every
 * node is flipped with probability *gpu_p; only if no node was flipped, every
 * node is updated synchronously by one randomly selected function evaluated
 * on the previous state.
 *
 * Launch requirements (no bounds check is performed on idx):
 *  - 1D grid/block; states, gpu_initialState (5 ints per thread) and gpu_mean
 *    (2 floats per thread) must have an entry for every launched thread.
 *  - n must not exceed 160, otherwise the per-thread state arrays overflow.
 *  - dynamic shared memory of at least
 *    (4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]) 4-byte words.
 *
 * Outputs:
 *  - gpu_trajectoryKernel[5 * idx + j * (*gpu_stateSize) * T + w]: word w of
 *    the state after step j (T = gridDim.x * blockDim.x).
 *  - gpu_trajectory[idx + j * T]: the state after step j folded into a float.
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and sample variance
 *    (divisor steps - 1, so a single step yields NaN) of those floats.
 *  - states[idx] and gpu_initialState are updated so a later launch continues
 *    the same trajectory.
 */
__global__ void kernelConverge5(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int kWords = 5;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int numNodes = *gpu_n;
	const int totalF = gpu_cumNf[numNodes];  // total number of functions
	const int totalV = gpu_cumNv[totalF];    // total number of function inputs

	// Carve the dynamic shared buffer into the network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = &nf[numNodes];
	int* cumNf = &nv[totalF];
	int* cumNv = &cumNf[numNodes + 1];
	int* F = &cumNv[totalF + 1];
	int* varF = &F[totalF];

	// Stage the tables cooperatively: every thread of the block copies a
	// strided share, then the block synchronizes before anyone reads them.
	for (int i = threadIdx.x; i < numNodes; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalV; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		cumNf[numNodes] = totalF;
		cumNv[totalF] = totalV;
	}
	__syncthreads();

	const float perturbProb = *gpu_p;
	const int steps = *gpu_steps;
	const int totalThreads = gridDim.x * blockDim.x;
	const int kernelStride = *gpu_stateSize * totalThreads;

	// current[] is the state the functions read; next[] is being built.
	int current[kWords];
	int next[kWords];
	for (int w = 0; w < kWords; w++) {
		current[w] = gpu_initialState[idx * kWords + w];
		next[w] = current[w];
	}

	// Work on a private copy of the generator state.
	curandState_t localState = states[idx];

	// Weight between consecutive state words when folding them into a float.
	// NOTE(review): if powNum[1][31] is stored as a signed 1 << 31 this factor
	// is negative -- confirm against the host initialisation of powNum.
	const float wordScale = (float) powNum[1][31] * 2;

	int trajSlot = idx;               // index into gpu_trajectory
	int kernelSlot = idx * kWords;    // index into gpu_trajectoryKernel
	float mean = 0;

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, in node order.
		bool perturbation = false;
		for (int i = 0; i < numNodes; i++) {
			const float draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next[i / 32] ^= (1u << (i % 32));
			}
		}

		// Network transition, only when nothing was perturbed.
		if (!perturbation) {
			for (int i = 0; i < numNodes; i++) {
				const float draw = curand_uniform(&localState);

				// Select a function of node i. The scan is clamped to the
				// node's last function: curand_uniform may return exactly
				// 1.0, so a last threshold that rounds below 1.0 would
				// otherwise send the scan into the next node's functions.
				int func = cumNf[i];
				const int lastFunc = cumNf[i + 1] - 1;
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				// Build the truth-table index from the current input bits.
				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}

				// Lowest bit after the shift is the function's output;
				// write it into bit i of the next state.
				const unsigned int value = (F[func] >> shifNum) & 1;
				const int word = i / 32;
				const int bit = i % 32;
				next[word] = (next[word] & ~(1u << bit)) | (value << bit);
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;
		for (int w = 0; w < kWords; w++) {
			current[w] = next[w];
			gpu_trajectoryKernel[kernelSlot + w] = next[w];
			result += ((unsigned int) current[w]) * times;
			times *= wordScale;
		}
		mean += result;
		gpu_trajectory[trajSlot] = result;

		trajSlot += totalThreads;
		kernelSlot += kernelStride;
	}

	// Mean and sample variance of this thread's recorded values.
	mean = mean / steps;
	gpu_mean[idx * 2] = mean;
	float variance = 0;
	trajSlot = idx;
	for (int j = 0; j < steps; j++) {
		const float minu = gpu_trajectory[trajSlot] - mean;
		variance += minu * minu;
		trajSlot += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist generator and network state for the next launch.
	states[idx] = localState;
	for (int w = 0; w < kWords; w++) {
		gpu_initialState[idx * kWords + w] = current[w];
	}
}
/**
 * for n=161-192
 *
 * Each thread simulates one trajectory of *gpu_steps steps, keeping its state
 * in six 32-bit words (node v is bit v % 32 of word v / 32). Per step every
 * node is flipped with probability *gpu_p; only if no node was flipped, every
 * node is updated synchronously by one randomly selected function evaluated
 * on the previous state.
 *
 * Launch requirements (no bounds check is performed on idx):
 *  - 1D grid/block; states, gpu_initialState (6 ints per thread) and gpu_mean
 *    (2 floats per thread) must have an entry for every launched thread.
 *  - n must not exceed 192, otherwise the per-thread state arrays overflow.
 *  - dynamic shared memory of at least
 *    (4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]) 4-byte words.
 *
 * Outputs:
 *  - gpu_trajectoryKernel[6 * idx + j * (*gpu_stateSize) * T + w]: word w of
 *    the state after step j (T = gridDim.x * blockDim.x).
 *  - gpu_trajectory[idx + j * T]: the state after step j folded into a float.
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and sample variance
 *    (divisor steps - 1, so a single step yields NaN) of those floats.
 *  - states[idx] and gpu_initialState are updated so a later launch continues
 *    the same trajectory.
 */
__global__ void kernelConverge6(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int kWords = 6;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int numNodes = *gpu_n;
	const int totalF = gpu_cumNf[numNodes];  // total number of functions
	const int totalV = gpu_cumNv[totalF];    // total number of function inputs

	// Carve the dynamic shared buffer into the network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalF];
	int* nv = &nf[numNodes];
	int* cumNf = &nv[totalF];
	int* cumNv = &cumNf[numNodes + 1];
	int* F = &cumNv[totalF + 1];
	int* varF = &F[totalF];

	// Stage the tables cooperatively: every thread of the block copies a
	// strided share, then the block synchronizes before anyone reads them.
	for (int i = threadIdx.x; i < numNodes; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalF; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalV; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		cumNf[numNodes] = totalF;
		cumNv[totalF] = totalV;
	}
	__syncthreads();

	const float perturbProb = *gpu_p;
	const int steps = *gpu_steps;
	const int totalThreads = gridDim.x * blockDim.x;
	const int kernelStride = *gpu_stateSize * totalThreads;

	// current[] is the state the functions read; next[] is being built.
	int current[kWords];
	int next[kWords];
	for (int w = 0; w < kWords; w++) {
		current[w] = gpu_initialState[idx * kWords + w];
		next[w] = current[w];
	}

	// Work on a private copy of the generator state.
	curandState_t localState = states[idx];

	// Weight between consecutive state words when folding them into a float.
	// NOTE(review): if powNum[1][31] is stored as a signed 1 << 31 this factor
	// is negative -- confirm against the host initialisation of powNum.
	const float wordScale = (float) powNum[1][31] * 2;

	int trajSlot = idx;               // index into gpu_trajectory
	int kernelSlot = idx * kWords;    // index into gpu_trajectoryKernel
	float mean = 0;

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, in node order.
		bool perturbation = false;
		for (int i = 0; i < numNodes; i++) {
			const float draw = curand_uniform(&localState);
			if (draw < perturbProb) {
				perturbation = true;
				next[i / 32] ^= (1u << (i % 32));
			}
		}

		// Network transition, only when nothing was perturbed.
		if (!perturbation) {
			for (int i = 0; i < numNodes; i++) {
				const float draw = curand_uniform(&localState);

				// Select a function of node i. The scan is clamped to the
				// node's last function: curand_uniform may return exactly
				// 1.0, so a last threshold that rounds below 1.0 would
				// otherwise send the scan into the next node's functions.
				int func = cumNf[i];
				const int lastFunc = cumNf[i + 1] - 1;
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				// Build the truth-table index from the current input bits.
				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}

				// Lowest bit after the shift is the function's output;
				// write it into bit i of the next state.
				const unsigned int value = (F[func] >> shifNum) & 1;
				const int word = i / 32;
				const int bit = i % 32;
				next[word] = (next[word] & ~(1u << bit)) | (value << bit);
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;
		for (int w = 0; w < kWords; w++) {
			current[w] = next[w];
			gpu_trajectoryKernel[kernelSlot + w] = next[w];
			result += ((unsigned int) current[w]) * times;
			times *= wordScale;
		}
		mean += result;
		gpu_trajectory[trajSlot] = result;

		trajSlot += totalThreads;
		kernelSlot += kernelStride;
	}

	// Mean and sample variance of this thread's recorded values.
	mean = mean / steps;
	gpu_mean[idx * 2] = mean;
	float variance = 0;
	trajSlot = idx;
	for (int j = 0; j < steps; j++) {
		const float minu = gpu_trajectory[trajSlot] - mean;
		variance += minu * minu;
		trajSlot += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist generator and network state for the next launch.
	states[idx] = localState;
	for (int w = 0; w < kWords; w++) {
		gpu_initialState[idx * kWords + w] = current[w];
	}
}
/**
 * Trajectory-recording simulation kernel for n=193-512 (at most 16 state
 * integers per thread).
 *
 * Every thread simulates its own copy of the network for *gpu_steps steps.
 * In each step one uniform number is drawn per node; a node whose draw is
 * below p has its bit flipped. Only when no node was flipped in the step is
 * every node updated synchronously: a second draw per node selects one of its
 * functions through the thresholds in cij, and the selected entry of F is
 * shifted by the pattern of its input bits to obtain the new value.
 *
 * Preconditions (none of them is checked here):
 *  - no bounds check on idx: every per-thread buffer must cover
 *    gridDim.x * blockDim.x threads;
 *  - *gpu_stateSize must not exceed 16 (size of the per-thread arrays);
 *  - with T = gpu_cumNf[n], the dynamic shared memory `arrays` must hold
 *    T floats (cij) followed by n + T + (n + 1) + (T + 1) + T + gpu_cumNv[T]
 *    ints (nf, nv, cumNf, cumNv, F, varF).
 *
 * Outputs:
 *  - gpu_trajectoryKernel[(j * totalThreads + idx) * stateSize + w]: state
 *    word w of this thread after step j;
 *  - gpu_trajectory[j * totalThreads + idx]: the state after step j folded
 *    into one float;
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and (steps - 1)-normalised
 *    variance of this thread's gpu_trajectory values;
 *  - gpu_initialState and states are overwritten with the final network state
 *    and generator state of the thread.
 */
__global__ void kernelConverge7(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	// Loop-invariant scalars are read from global memory once.
	const int totalFunctions = gpu_cumNf[n];
	const int stateSize = *gpu_stateSize;
	const int steps = *gpu_steps;
	const float p = *gpu_p;

	// Carve the dynamic shared buffer into the per-block network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFunctions];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFunctions];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFunctions + 1];
	int* varF = (int*) &F[totalFunctions];

	int initialStateCopy[16];	// state being built for the next step
	int initialState[16];		// state the functions are evaluated on

	// Thread 0 of each block fills the shared tables; the barrier below
	// makes them visible to the rest of the block.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalFunctions;
		for (int i = 0; i < totalFunctions; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalFunctions] = gpu_cumNv[totalFunctions];
		for (int i = 0; i < cumNv[totalFunctions]; i++) {
			varF[i] = gpu_varF[i];
		}
	}
	__syncthreads();

	const int stateBase = idx * stateSize;
	for (int i = 0; i < stateSize; i++) {
		initialState[i] = gpu_initialState[stateBase + i];
		initialStateCopy[i] = initialState[i];
	}

	// Work on a private copy of the generator state; it is stored back at the end.
	curandState_t localState = states[idx];
	const int totalThreads = gridDim.x * blockDim.x;
	const int wordsPerStep = stateSize * totalThreads;
	int trajectoryOffset = idx;		// slot of this thread in gpu_trajectory for step j
	int kernelOffset = stateBase;	// first word of this thread in gpu_trajectoryKernel for step j

	float mean = 0;
	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, flip the node's bit on a hit.
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= (1 << (i % 32));
			}
		}

		// Update phase, skipped whenever any node was perturbed in this step.
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				const int firstFunction = cumNf[i];
				// Index of the last function of node i, relative to firstFunction.
				const int lastChoice = cumNf[i + 1] - firstFunction - 1;
				const float draw = curand_uniform(&localState);
				// The scan stops at the node's last function: curand_uniform can
				// return exactly 1.0, so an unbounded scan would leave the node's
				// range whenever the final threshold is rounded below 1.0.
				int choice = 0;
				while (choice < lastChoice && draw > cij[firstFunction + choice]) {
					choice++;
				}
				const int functionIndex = firstFunction + choice;
				const int firstInput = cumNv[functionIndex];
				const int inputCount = cumNv[functionIndex + 1] - firstInput;
				// Accumulate the shift selected by the inputs that are set in the
				// current state (powNum[1][ind] is presumably 2^ind - confirm at
				// the place where powNum is filled).
				int shifNum = 0;
				for (int ind = 0; ind < inputCount; ind++) {
					const int input = varF[firstInput + ind];
					if (((initialState[input / 32] >> (input % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				// After shifting, the lowest bit is the node's new value.
				const int value = (F[functionIndex] >> shifNum) & 1;
				// Write that value into bit (i % 32) of word (i / 32).
				initialStateCopy[i / 32] ^= (-value ^ initialStateCopy[i / 32])
						& (1 << (i % 32));
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;
		for (int i = 0; i < stateSize; i++) {
			initialState[i] = initialStateCopy[i];
			gpu_trajectoryKernel[kernelOffset + i] = initialStateCopy[i];
			result += ((unsigned int) initialState[i]) * times;
			times *= (float) powNum[1][31] * 2;
		}
		mean += result;
		gpu_trajectory[trajectoryOffset] = result;

		trajectoryOffset += totalThreads;
		kernelOffset += wordsPerStep;
	}

	mean = mean / steps;
	gpu_mean[idx * 2] = mean;

	// Second pass over this thread's recorded values for the variance.
	// NOTE(review): steps == 1 divides by zero here and stores NaN.
	float variance = 0;
	trajectoryOffset = idx;
	for (int i = 0; i < steps; i++) {
		const float minu = gpu_trajectory[trajectoryOffset] - mean;
		variance += minu * minu;
		trajectoryOffset += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist the generator state and the final network state.
	states[idx] = localState;
	for (int i = 0; i < stateSize; i++) {
		gpu_initialState[stateBase + i] = initialState[i];
	}
}
/**
 * Trajectory-recording simulation kernel for n=513-2048 (at most 64 state
 * integers per thread).
 *
 * Every thread simulates its own copy of the network for *gpu_steps steps.
 * In each step one uniform number is drawn per node; a node whose draw is
 * below p has its bit flipped. Only when no node was flipped in the step is
 * every node updated synchronously: a second draw per node selects one of its
 * functions through the thresholds in cij, and the selected entry of F is
 * shifted by the pattern of its input bits to obtain the new value.
 *
 * Preconditions (none of them is checked here):
 *  - no bounds check on idx: every per-thread buffer must cover
 *    gridDim.x * blockDim.x threads;
 *  - *gpu_stateSize must not exceed 64 (size of the per-thread arrays);
 *  - with T = gpu_cumNf[n], the dynamic shared memory `arrays` must hold
 *    T floats (cij) followed by n + T + (n + 1) + (T + 1) + T + gpu_cumNv[T]
 *    ints (nf, nv, cumNf, cumNv, F, varF).
 *
 * Outputs:
 *  - gpu_trajectoryKernel[(j * totalThreads + idx) * stateSize + w]: state
 *    word w of this thread after step j;
 *  - gpu_trajectory[j * totalThreads + idx]: the state after step j folded
 *    into one float;
 *  - gpu_mean[2 * idx], gpu_mean[2 * idx + 1]: mean and (steps - 1)-normalised
 *    variance of this thread's gpu_trajectory values;
 *  - gpu_initialState and states are overwritten with the final network state
 *    and generator state of the thread.
 */
__global__ void kernelConverge8(curandState_t* states, int* gpu_n, int* gpu_nf,
		int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F, int* gpu_varF,
		float* gpu_cij, float* gpu_p, int* gpu_initialState, float* gpu_mean,
		float* gpu_trajectory, int* gpu_trajectoryKernel, int* gpu_steps,
		int* gpu_stateSize) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	// Loop-invariant scalars are read from global memory once.
	const int totalFunctions = gpu_cumNf[n];
	const int stateSize = *gpu_stateSize;
	const int steps = *gpu_steps;
	const float p = *gpu_p;

	// Carve the dynamic shared buffer into the per-block network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFunctions];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFunctions];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFunctions + 1];
	int* varF = (int*) &F[totalFunctions];

	int initialStateCopy[64];	// state being built for the next step
	int initialState[64];		// state the functions are evaluated on

	// Thread 0 of each block fills the shared tables; the barrier below
	// makes them visible to the rest of the block.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalFunctions;
		for (int i = 0; i < totalFunctions; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalFunctions] = gpu_cumNv[totalFunctions];
		for (int i = 0; i < cumNv[totalFunctions]; i++) {
			varF[i] = gpu_varF[i];
		}
	}
	__syncthreads();

	const int stateBase = idx * stateSize;
	for (int i = 0; i < stateSize; i++) {
		initialState[i] = gpu_initialState[stateBase + i];
		initialStateCopy[i] = initialState[i];
	}

	// Work on a private copy of the generator state; it is stored back at the end.
	curandState_t localState = states[idx];
	const int totalThreads = gridDim.x * blockDim.x;
	const int wordsPerStep = stateSize * totalThreads;
	int trajectoryOffset = idx;		// slot of this thread in gpu_trajectory for step j
	int kernelOffset = stateBase;	// first word of this thread in gpu_trajectoryKernel for step j

	float mean = 0;
	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, flip the node's bit on a hit.
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy[i / 32] ^= (1 << (i % 32));
			}
		}

		// Update phase, skipped whenever any node was perturbed in this step.
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				const int firstFunction = cumNf[i];
				// Index of the last function of node i, relative to firstFunction.
				const int lastChoice = cumNf[i + 1] - firstFunction - 1;
				const float draw = curand_uniform(&localState);
				// The scan stops at the node's last function: curand_uniform can
				// return exactly 1.0, so an unbounded scan would leave the node's
				// range whenever the final threshold is rounded below 1.0.
				int choice = 0;
				while (choice < lastChoice && draw > cij[firstFunction + choice]) {
					choice++;
				}
				const int functionIndex = firstFunction + choice;
				const int firstInput = cumNv[functionIndex];
				const int inputCount = cumNv[functionIndex + 1] - firstInput;
				// Accumulate the shift selected by the inputs that are set in the
				// current state (powNum[1][ind] is presumably 2^ind - confirm at
				// the place where powNum is filled).
				int shifNum = 0;
				for (int ind = 0; ind < inputCount; ind++) {
					const int input = varF[firstInput + ind];
					if (((initialState[input / 32] >> (input % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				// After shifting, the lowest bit is the node's new value.
				const int value = (F[functionIndex] >> shifNum) & 1;
				// Write that value into bit (i % 32) of word (i / 32).
				initialStateCopy[i / 32] ^= (-value ^ initialStateCopy[i / 32])
						& (1 << (i % 32));
			}
		}

		// Commit the new state, record it, and fold it into one float.
		float result = 0;
		float times = 1;
		for (int i = 0; i < stateSize; i++) {
			initialState[i] = initialStateCopy[i];
			gpu_trajectoryKernel[kernelOffset + i] = initialStateCopy[i];
			result += ((unsigned int) initialState[i]) * times;
			times *= (float) powNum[1][31] * 2;
		}
		mean += result;
		gpu_trajectory[trajectoryOffset] = result;

		trajectoryOffset += totalThreads;
		kernelOffset += wordsPerStep;
	}

	mean = mean / steps;
	gpu_mean[idx * 2] = mean;

	// Second pass over this thread's recorded values for the variance.
	// NOTE(review): steps == 1 divides by zero here and stores NaN.
	float variance = 0;
	trajectoryOffset = idx;
	for (int i = 0; i < steps; i++) {
		const float minu = gpu_trajectory[trajectoryOffset] - mean;
		variance += minu * minu;
		trajectoryOffset += totalThreads;
	}
	variance = variance / (steps - 1);
	gpu_mean[idx * 2 + 1] = variance;

	// Persist the generator state and the final network state.
	states[idx] = localState;
	for (int i = 0; i < stateSize; i++) {
		gpu_initialState[stateBase + i] = initialState[i];
	}
}
/**
 * Initial (non-recording) simulation kernel for networks whose state fits in
 * a single integer per thread.
 *
 * Every thread advances its own state by *gpu_steps steps and writes only the
 * final state back; no trajectory or statistics are produced. In each step
 * one uniform number is drawn per node and a node whose draw is below p has
 * its bit flipped. Only when no node was flipped is every node updated
 * synchronously from the functions selected through cij.
 *
 * gpu_steps      number of steps simulated by this call
 * gpu_stateSize  number of integers per thread in gpu_initialState; used as
 *                the stride for both reading and writing the state
 *
 * Preconditions (not checked here): n must be at most 32 because node i is
 * stored in bit i of one int; no bounds check on idx, so every per-thread
 * buffer must cover gridDim.x * blockDim.x threads; with T = gpu_cumNf[n] the
 * dynamic shared memory `arrays` must hold T floats (cij) followed by
 * n + T + (n + 1) + (T + 1) + T + gpu_cumNv[T] ints
 * (nf, nv, cumNf, cumNv, F, varF).
 */
__global__ void kernelConvergeInitial(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFunctions = gpu_cumNf[n];
	const int steps = *gpu_steps;
	const float p = *gpu_p;

	// Carve the dynamic shared buffer into the per-block network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFunctions];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFunctions];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFunctions + 1];
	int* varF = (int*) &F[totalFunctions];

	// Thread 0 of each block fills the shared tables; the barrier below
	// makes them visible to the rest of the block.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalFunctions;
		for (int i = 0; i < totalFunctions; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalFunctions] = gpu_cumNv[totalFunctions];
		for (int i = 0; i < cumNv[totalFunctions]; i++) {
			varF[i] = gpu_varF[i];
		}
	}
	__syncthreads();

	// The state is loaded from the same strided slot it is stored to below.
	// The load used to be gpu_initialState[idx], which matches the store
	// only when *gpu_stateSize is 1.
	const int stateIndex = *gpu_stateSize * idx;
	int initialState = gpu_initialState[stateIndex];	// state the functions read
	int initialStateCopy = initialState;				// state being built

	// Work on a private copy of the generator state; it is stored back at the end.
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, flip the node's bit on a hit.
		bool perturbation = false;
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy ^= (1 << i);
			}
		}

		// Update phase, skipped whenever any node was perturbed in this step.
		if (!perturbation) {
			for (int i = 0; i < n; i++) {
				const int firstFunction = cumNf[i];
				// Index of the last function of node i, relative to firstFunction.
				const int lastChoice = cumNf[i + 1] - firstFunction - 1;
				const float draw = curand_uniform(&localState);
				// The scan stops at the node's last function: curand_uniform can
				// return exactly 1.0, so an unbounded scan would leave the node's
				// range whenever the final threshold is rounded below 1.0.
				int choice = 0;
				while (choice < lastChoice && draw > cij[firstFunction + choice]) {
					choice++;
				}
				const int functionIndex = firstFunction + choice;
				const int firstInput = cumNv[functionIndex];
				const int inputCount = cumNv[functionIndex + 1] - firstInput;
				// Accumulate the shift selected by the inputs that are set in the
				// current state (powNum[1][ind] is presumably 2^ind - confirm at
				// the place where powNum is filled).
				int shifNum = 0;
				for (int ind = 0; ind < inputCount; ind++) {
					if (((initialState >> (varF[firstInput + ind] % 32)) & 1)
							!= 0) {
						shifNum += powNum[1][ind];
					}
				}
				// After shifting, the lowest bit is the node's new value.
				const int value = (F[functionIndex] >> shifNum) & 1;
				// Write that value into bit i.
				initialStateCopy ^= (-value ^ initialStateCopy) & (1 << i);
			}
		}

		// The freshly built state becomes the current one.
		initialState = initialStateCopy;
	}

	// Persist the generator state and the final network state.
	states[idx] = localState;
	gpu_initialState[stateIndex] = initialState;
}

/**
 * Helper of kernelConvergeInitial2: draws one uniform number from rng, uses
 * it to select one function of `node` through the thresholds in cij, and
 * returns the value (0 or 1) that function yields on the state held in
 * word0 (nodes 0-31) and word1 (nodes 32-63).
 */
static __device__ __forceinline__ int convergeInitial2NextValue(
		curandState_t* rng, int node, const float* cij, const int* cumNf,
		const int* cumNv, const int* F, const int* varF, int word0,
		int word1) {
	const int firstFunction = cumNf[node];
	// Index of the node's last function, relative to firstFunction.
	const int lastChoice = cumNf[node + 1] - firstFunction - 1;
	const float draw = curand_uniform(rng);
	// The scan stops at the node's last function: curand_uniform can return
	// exactly 1.0, so an unbounded scan would leave the node's range whenever
	// the final threshold is rounded below 1.0.
	int choice = 0;
	while (choice < lastChoice && draw > cij[firstFunction + choice]) {
		choice++;
	}
	const int functionIndex = firstFunction + choice;
	const int firstInput = cumNv[functionIndex];
	const int inputCount = cumNv[functionIndex + 1] - firstInput;
	// Accumulate the shift selected by the inputs that are set in the current
	// state (powNum[1][ind] is presumably 2^ind - confirm where it is filled).
	int shifNum = 0;
	for (int ind = 0; ind < inputCount; ind++) {
		const int input = varF[firstInput + ind];
		// Pick the state word holding this input. A word index other than
		// 0 or 1 is outside this kernel's range; the value is then left as
		// the index itself, as before.
		int word = input / 32;
		if (word == 0) {
			word = word0;
		} else if (word == 1) {
			word = word1;
		}
		if (((word >> (input % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	// After shifting, the lowest bit is the node's new value.
	return (F[functionIndex] >> shifNum) & 1;
}

/**
 * Initial (non-recording) simulation kernel for n=33-64 (two state integers
 * per thread).
 *
 * Every thread advances its own state by *gpu_steps steps and writes only the
 * final state back. In each step one uniform number is drawn per node and a
 * node whose draw is below p has its bit flipped. Only when no node was
 * flipped is every node updated synchronously from the functions selected
 * through cij.
 *
 * gpu_steps      number of steps simulated by this call
 * gpu_stateSize  not read by this kernel; the per-thread stride is fixed at 2
 *
 * Preconditions (not checked here): n must be at least 32 and at most 64,
 * since the first word is always processed for 32 nodes; no bounds check on
 * idx, so every per-thread buffer must cover gridDim.x * blockDim.x threads;
 * with T = gpu_cumNf[n] the dynamic shared memory `arrays` must hold T floats
 * (cij) followed by n + T + (n + 1) + (T + 1) + T + gpu_cumNv[T] ints
 * (nf, nv, cumNf, cumNv, F, varF).
 */
__global__ void kernelConvergeInitial2(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFunctions = gpu_cumNf[n];
	const int steps = *gpu_steps;
	const float p = *gpu_p;

	// Carve the dynamic shared buffer into the per-block network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFunctions];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFunctions];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFunctions + 1];
	int* varF = (int*) &F[totalFunctions];

	// Thread 0 of each block fills the shared tables; the barrier below
	// makes them visible to the rest of the block.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalFunctions;
		for (int i = 0; i < totalFunctions; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalFunctions] = gpu_cumNv[totalFunctions];
		for (int i = 0; i < cumNv[totalFunctions]; i++) {
			varF[i] = gpu_varF[i];
		}
	}
	__syncthreads();

	const int stateIndex = 2 * idx;
	// initialState*: state the functions read; initialStateCopy*: state being built.
	int initialState = gpu_initialState[stateIndex];
	int initialState2 = gpu_initialState[stateIndex + 1];
	int initialStateCopy = initialState;
	int initialStateCopy2 = initialState2;

	// Work on a private copy of the generator state; it is stored back at the end.
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, flip the node's bit on a hit.
		bool perturbation = false;
		for (int i = 0; i < 32; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy ^= (1 << i);
			}
		}
		for (int i = 32; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy2 ^= (1 << (i - 32));
			}
		}

		// Update phase, skipped whenever any node was perturbed in this step.
		// The condition used to be `false&&!perturbation`, which made the
		// update unreachable so that only perturbations were ever applied.
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				const int value = convergeInitial2NextValue(&localState, i, cij,
						cumNf, cumNv, F, varF, initialState, initialState2);
				// Write the new value into bit i of the first word.
				initialStateCopy ^= (-value ^ initialStateCopy) & (1 << i);
			}
			for (int i = 32; i < n; i++) {
				const int value = convergeInitial2NextValue(&localState, i, cij,
						cumNf, cumNv, F, varF, initialState, initialState2);
				// Write the new value into bit (i - 32) of the second word.
				initialStateCopy2 ^= (-value ^ initialStateCopy2)
						& (1 << (i - 32));
			}
		}

		// The freshly built state becomes the current one.
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;
	}

	// Persist the generator state and the final network state.
	states[idx] = localState;
	gpu_initialState[stateIndex] = initialStateCopy;
	gpu_initialState[stateIndex + 1] = initialStateCopy2;
}

/**
 * Helper of kernelConvergeInitial3: draws one uniform number from rng, uses
 * it to select one function of `node` through the thresholds in cij, and
 * returns the value (0 or 1) that function yields on the state held in
 * word0 (nodes 0-31), word1 (nodes 32-63) and word2 (nodes 64-95).
 */
static __device__ __forceinline__ int convergeInitial3NextValue(
		curandState_t* rng, int node, const float* cij, const int* cumNf,
		const int* cumNv, const int* F, const int* varF, int word0, int word1,
		int word2) {
	const int firstFunction = cumNf[node];
	// Index of the node's last function, relative to firstFunction.
	const int lastChoice = cumNf[node + 1] - firstFunction - 1;
	const float draw = curand_uniform(rng);
	// The scan stops at the node's last function: curand_uniform can return
	// exactly 1.0, so an unbounded scan would leave the node's range whenever
	// the final threshold is rounded below 1.0.
	int choice = 0;
	while (choice < lastChoice && draw > cij[firstFunction + choice]) {
		choice++;
	}
	const int functionIndex = firstFunction + choice;
	const int firstInput = cumNv[functionIndex];
	const int inputCount = cumNv[functionIndex + 1] - firstInput;
	// Accumulate the shift selected by the inputs that are set in the current
	// state (powNum[1][ind] is presumably 2^ind - confirm where it is filled).
	int shifNum = 0;
	for (int ind = 0; ind < inputCount; ind++) {
		const int input = varF[firstInput + ind];
		// Pick the state word holding this input. A word index other than
		// 0, 1 or 2 is outside this kernel's range; the value is then left
		// as the index itself, as before.
		int word = input / 32;
		if (word == 0) {
			word = word0;
		} else if (word == 1) {
			word = word1;
		} else if (word == 2) {
			word = word2;
		}
		if (((word >> (input % 32)) & 1) != 0) {
			shifNum += powNum[1][ind];
		}
	}
	// After shifting, the lowest bit is the node's new value.
	return (F[functionIndex] >> shifNum) & 1;
}

/**
 * Initial (non-recording) simulation kernel for n=65-96 (three state integers
 * per thread).
 *
 * Every thread advances its own state by *gpu_steps steps and writes only the
 * final state back. In each step one uniform number is drawn per node and a
 * node whose draw is below p has its bit flipped. Only when no node was
 * flipped is every node updated synchronously from the functions selected
 * through cij.
 *
 * gpu_steps      number of steps simulated by this call
 * gpu_stateSize  not read by this kernel; the per-thread stride is fixed at 3
 *
 * Preconditions (not checked here): n must be at least 64 and at most 96,
 * since the first two words are always processed for 32 nodes each; no bounds
 * check on idx, so every per-thread buffer must cover gridDim.x * blockDim.x
 * threads; with T = gpu_cumNf[n] the dynamic shared memory `arrays` must hold
 * T floats (cij) followed by n + T + (n + 1) + (T + 1) + T + gpu_cumNv[T]
 * ints (nf, nv, cumNf, cumNv, F, varF).
 */
__global__ void kernelConvergeInitial3(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFunctions = gpu_cumNf[n];
	const int steps = *gpu_steps;
	const float p = *gpu_p;

	// Carve the dynamic shared buffer into the per-block network tables.
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFunctions];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFunctions];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFunctions + 1];
	int* varF = (int*) &F[totalFunctions];

	// Thread 0 of each block fills the shared tables; the barrier below
	// makes them visible to the rest of the block.
	if (threadIdx.x == 0) {
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = totalFunctions;
		for (int i = 0; i < totalFunctions; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}
		cumNv[totalFunctions] = gpu_cumNv[totalFunctions];
		for (int i = 0; i < cumNv[totalFunctions]; i++) {
			varF[i] = gpu_varF[i];
		}
	}
	__syncthreads();

	const int stateIndex = 3 * idx;
	// initialState*: state the functions read; initialStateCopy*: state being built.
	int initialState = gpu_initialState[stateIndex];
	int initialState2 = gpu_initialState[stateIndex + 1];
	int initialState3 = gpu_initialState[stateIndex + 2];
	int initialStateCopy = initialState;
	int initialStateCopy2 = initialState2;
	int initialStateCopy3 = initialState3;

	// Work on a private copy of the generator state; it is stored back at the end.
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		// Perturbation phase: one draw per node, flip the node's bit on a hit.
		bool perturbation = false;
		for (int i = 0; i < 32; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy ^= (1 << i);
			}
		}
		for (int i = 32; i < 64; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy2 ^= (1 << (i - 32));
			}
		}
		for (int i = 64; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				initialStateCopy3 ^= (1 << (i - 64));
			}
		}

		// Update phase, skipped whenever any node was perturbed in this step.
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				const int value = convergeInitial3NextValue(&localState, i, cij,
						cumNf, cumNv, F, varF, initialState, initialState2,
						initialState3);
				// Write the new value into bit i of the first word.
				initialStateCopy ^= (-value ^ initialStateCopy) & (1 << i);
			}
			for (int i = 32; i < 64; i++) {
				const int value = convergeInitial3NextValue(&localState, i, cij,
						cumNf, cumNv, F, varF, initialState, initialState2,
						initialState3);
				// Write the new value into bit (i - 32) of the second word.
				initialStateCopy2 ^= (-value ^ initialStateCopy2)
						& (1 << (i - 32));
			}
			for (int i = 64; i < n; i++) {
				const int value = convergeInitial3NextValue(&localState, i, cij,
						cumNf, cumNv, F, varF, initialState, initialState2,
						initialState3);
				// Write the new value into bit (i - 64) of the third word.
				initialStateCopy3 ^= (-value ^ initialStateCopy3)
						& (1 << (i - 64));
			}
		}

		// The freshly built state becomes the current one.
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;
		initialState3 = initialStateCopy3;
	}

	// Persist the generator state and the final network state.
	states[idx] = localState;
	gpu_initialState[stateIndex] = initialStateCopy;
	gpu_initialState[stateIndex + 1] = initialStateCopy2;
	gpu_initialState[stateIndex + 2] = initialStateCopy3;
}
/**
 * run converge initial for n=97-128
 * gpu_currentTrajectorySize the current trajectory size, initially it is 0
 * gpu_steps how many steps to be simulated in this call
 */
__global__ void kernelConvergeInitial4(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	int idx = threadIdx.x + blockIdx.x * blockDim.x;		//one register
	int n = *gpu_n; //make variables local; two registers

	//let's make shared memory
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[gpu_cumNf[n]];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[gpu_cumNf[n]];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[gpu_cumNf[n] + 1];
	int* varF = (int*) &F[gpu_cumNf[n]];
	//int* positiveIndex = (int*) &varF[gpu_cumNv[gpu_cumNf[n]]];
	//int* negativeIndex = (int*) &positiveIndex[*gpu_stateSize];
	//int* initialState = (int*) &negativeIndex[*gpu_stateSize];
	//int* initialStateCopy = (int*) &initialState[*gpu_stateSize * blockDim.x*gridDim.x]; //Temporally using blockDim,need to verify

	float p = *gpu_p; //can be put in texture
	int initialStateCopy, initialStateCopy2, initialStateCopy3,
			initialStateCopy4;
	int initialState, initialState2, initialState3, initialState4;

	if (threadIdx.x == 0) {
		//printf("nF:\n");
		for (int i = 0; i < n; i++) {
			nf[i] = gpu_nf[i];
			cumNf[i] = gpu_cumNf[i];
		}
		cumNf[n] = gpu_cumNf[n];
		for (int i = 0; i < cumNf[n]; i++) {
			nv[i] = gpu_nv[i];
			cumNv[i] = gpu_cumNv[i];
			F[i] = gpu_F[i];
			cij[i] = gpu_cij[i];
		}

		cumNv[cumNf[n]] = gpu_cumNv[cumNf[n]];
		//printf("varF:\n");
		for (int i = 0; i < cumNv[cumNf[n]]; i++) {
			varF[i] = gpu_varF[i];
			//	printf("i=%d, %d \t",i, varF[i]);
		}
	}

	__syncthreads();

	int relativeIndex = idx * 4;
	initialState = gpu_initialState[relativeIndex];
	initialState2 = gpu_initialState[relativeIndex + 1];
	initialState3 = gpu_initialState[relativeIndex + 2];
	initialState4 = gpu_initialState[relativeIndex + 3];

	initialStateCopy = initialState;
	initialStateCopy2 = initialState2;
	initialStateCopy3 = initialState3;
	initialStateCopy4 = initialState4;

	int steps = *gpu_steps; //can be put in texture
	// Copy state to local memory for efficiency
	curandState_t localState = states[idx];
	float rand;
	bool perturbation = false;

	for (int j = 0; j < steps; j++) {
		perturbation = false;
		//check perturbation
		for (int i = 0; i < 32; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy = initialStateCopy ^ (1 << i); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 32; i < 64; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy2 = initialStateCopy2 ^ (1 << (i - 32)); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 64; i < 96; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy3 = initialStateCopy3 ^ (1 << (i - 64)); //might use constant memory to replace i/32 and i%32
			}
		}
		for (int i = 96; i < n; i++) {
			rand = curand_uniform(&localState);
			//if(idx==0) printf("\trand %d-%d: %f\n",j,i,rand);
			if (rand < p) {
				perturbation = true;
				//relativeIndex = i / 32;
				//no need to care about initialState here, only operate on initialStateCopy
				initialStateCopy4 = initialStateCopy4 ^ (1 << (i - 96)); //might use constant memory to replace i/32 and i%32
			}
		}
		if (!perturbation) {
			for (int i = 0; i < 32; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy ^= (-(elementF & 1) ^ initialStateCopy)
						& (1 << i);
			}
			for (int i = 32; i < 64; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy2 ^= (-(elementF & 1) ^ initialStateCopy2)
						& (1 << (i - 32));
			}
			for (int i = 64; i < 96; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy3 ^= (-(elementF & 1) ^ initialStateCopy3)
						& (1 << (i - 64));
			}
			for (int i = 96; i < n; i++) {
				relativeIndex = 0;
				rand = curand_uniform(&localState);
				while (rand > cij[cumNf[i] + relativeIndex]) {
					relativeIndex++;
				}
				int elementF = F[cumNf[i] + relativeIndex];
				int startVarFIndex = cumNv[cumNf[i] + relativeIndex];
				int resultStateSize = cumNv[cumNf[i] + relativeIndex + 1]
						- startVarFIndex;
				int shifNum = 0;
				for (int ind = 0; ind < resultStateSize; ind++) {
					relativeIndex = varF[startVarFIndex + ind] / 32;
					if (relativeIndex == 0) {
						relativeIndex = initialState;
					} else if (relativeIndex == 1) {
						relativeIndex = initialState2;
					} else if (relativeIndex == 2) {
						relativeIndex = initialState3;
					}else if (relativeIndex == 3) {
						relativeIndex = initialState4;
					}
					if (((relativeIndex >> (varF[startVarFIndex + ind] % 32))
							& 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				elementF = elementF >> shifNum; //after shifting, the last bit will be the value;

				//relativeIndex = i / 32;
				initialStateCopy4 ^= (-(elementF & 1) ^ initialStateCopy4)
						& (1 << (i - 96));
			}
		}
		//simulation finished
		//update initialState to the new state
		initialState = initialStateCopy;
		initialState2 = initialStateCopy2;
		initialState3 = initialStateCopy3;
		initialState4 = initialStateCopy4;
	}
	//update state
	states[idx] = localState;
	relativeIndex = 4 * idx;
	gpu_initialState[relativeIndex] = initialStateCopy;
	gpu_initialState[relativeIndex + 1] = initialStateCopy2;
	gpu_initialState[relativeIndex + 2] = initialStateCopy3;
	gpu_initialState[relativeIndex + 3] = initialStateCopy4;
	//printf("idx=%d", idx);
}
/**
 * Advances every thread's network state by *gpu_steps steps without storing
 * the visited states; variant for n = 129-160 nodes (5 packed 32-bit words
 * per state).
 *
 * Launch layout: 1D grid of 1D blocks. Thread idx = threadIdx.x +
 * blockIdx.x * blockDim.x owns states[idx] and the 5 words
 * gpu_initialState[5 * idx .. 5 * idx + 4]. There is no bounds guard, so
 * both arrays must hold an entry for every launched thread.
 * Dynamic shared memory: 4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]
 * 4-byte elements (cij, nf, nv, cumNf, cumNv, F, varF, in that order).
 *
 * states           per-thread cuRAND generators; read at entry, written back at exit
 * gpu_n            number of nodes n
 * gpu_nf           n entries, mirrored into shared memory
 * gpu_nv           gpu_cumNf[n] entries, mirrored into shared memory
 * gpu_cumNf        n + 1 entries; node i selects among functions
 *                  cumNf[i] .. cumNf[i + 1] - 1
 * gpu_cumNv        gpu_cumNf[n] + 1 entries; function f reads the inputs
 *                  varF[cumNv[f]] .. varF[cumNv[f + 1] - 1]
 * gpu_F            one packed truth table (int) per function
 * gpu_varF         input node indices of all functions
 * gpu_cij          per-function thresholds, scanned in order until the random
 *                  draw no longer exceeds the threshold
 * gpu_p            per-node perturbation probability
 * gpu_initialState in/out: packed state of every thread
 * gpu_steps        number of steps to simulate in this call
 * gpu_stateSize    not used by this variant (word count is fixed at 5)
 */
__global__ void kernelConvergeInitial5(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int WORDS = 5;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFuncs = gpu_cumNf[n];
	const int totalVars = gpu_cumNv[totalFuncs];

	//carve the dynamic shared buffer into the PBN tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFuncs];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFuncs];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFuncs + 1];
	int* varF = (int*) &F[totalFuncs];

	//all threads of the block share the copy work (block-strided loops)
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalFuncs; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVars; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//the two prefix-sum arrays have one trailing entry each
		cumNf[n] = totalFuncs;
		cumNv[totalFuncs] = totalVars;
	}

	__syncthreads();

	const float p = *gpu_p;
	const int steps = *gpu_steps;

	//current: state at the start of the step; next: state being built
	int current[WORDS];
	int next[WORDS];
	const int base = idx * WORDS;
	for (int w = 0; w < WORDS; w++) {
		current[w] = gpu_initialState[base + w];
		next[w] = current[w];
	}

	//work on a private copy of the generator and store it back at the end
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//perturbation phase: each node flips independently with probability p
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				next[i / 32] ^= (int) (1u << (i % 32));
			}
		}

		if (!perturbation) {
			//no node was perturbed: update every node with one of its functions
			for (int i = 0; i < n; i++) {
				const float draw = curand_uniform(&localState);
				//curand_uniform may return exactly 1.0f, so stop at the node's
				//last function instead of running into the next node's entries
				const int lastFunc = cumNf[i + 1] - 1;
				int func = cumNf[i];
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				//build the truth-table row index from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				//after shifting, the lowest bit is the node's new value
				const int value = (F[func] >> shifNum) & 1;
				const int mask = (int) (1u << (i % 32));
				//set bit i of the next state to value
				next[i / 32] ^= (-value ^ next[i / 32]) & mask;
			}
		}
		//step finished: the built state becomes the current one
		for (int w = 0; w < WORDS; w++) {
			current[w] = next[w];
		}
	}

	//publish generator and final state
	states[idx] = localState;
	for (int w = 0; w < WORDS; w++) {
		gpu_initialState[base + w] = next[w];
	}
}
/**
 * Advances every thread's network state by *gpu_steps steps without storing
 * the visited states; variant for n = 161-192 nodes (6 packed 32-bit words
 * per state).
 *
 * Launch layout: 1D grid of 1D blocks. Thread idx = threadIdx.x +
 * blockIdx.x * blockDim.x owns states[idx] and the 6 words
 * gpu_initialState[6 * idx .. 6 * idx + 5]. There is no bounds guard, so
 * both arrays must hold an entry for every launched thread.
 * Dynamic shared memory: 4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]
 * 4-byte elements (cij, nf, nv, cumNf, cumNv, F, varF, in that order).
 *
 * states           per-thread cuRAND generators; read at entry, written back at exit
 * gpu_n            number of nodes n
 * gpu_nf           n entries, mirrored into shared memory
 * gpu_nv           gpu_cumNf[n] entries, mirrored into shared memory
 * gpu_cumNf        n + 1 entries; node i selects among functions
 *                  cumNf[i] .. cumNf[i + 1] - 1
 * gpu_cumNv        gpu_cumNf[n] + 1 entries; function f reads the inputs
 *                  varF[cumNv[f]] .. varF[cumNv[f + 1] - 1]
 * gpu_F            one packed truth table (int) per function
 * gpu_varF         input node indices of all functions
 * gpu_cij          per-function thresholds, scanned in order until the random
 *                  draw no longer exceeds the threshold
 * gpu_p            per-node perturbation probability
 * gpu_initialState in/out: packed state of every thread
 * gpu_steps        number of steps to simulate in this call
 * gpu_stateSize    not used by this variant (word count is fixed at 6)
 */
__global__ void kernelConvergeInitial6(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int WORDS = 6;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFuncs = gpu_cumNf[n];
	const int totalVars = gpu_cumNv[totalFuncs];

	//carve the dynamic shared buffer into the PBN tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFuncs];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFuncs];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFuncs + 1];
	int* varF = (int*) &F[totalFuncs];

	//all threads of the block share the copy work (block-strided loops)
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalFuncs; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVars; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//the two prefix-sum arrays have one trailing entry each
		cumNf[n] = totalFuncs;
		cumNv[totalFuncs] = totalVars;
	}

	__syncthreads();

	const float p = *gpu_p;
	const int steps = *gpu_steps;

	//current: state at the start of the step; next: state being built
	int current[WORDS];
	int next[WORDS];
	const int base = idx * WORDS;
	for (int w = 0; w < WORDS; w++) {
		current[w] = gpu_initialState[base + w];
		next[w] = current[w];
	}

	//work on a private copy of the generator and store it back at the end
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//perturbation phase: each node flips independently with probability p
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				next[i / 32] ^= (int) (1u << (i % 32));
			}
		}

		if (!perturbation) {
			//no node was perturbed: update every node with one of its functions
			for (int i = 0; i < n; i++) {
				const float draw = curand_uniform(&localState);
				//curand_uniform may return exactly 1.0f, so stop at the node's
				//last function instead of running into the next node's entries
				const int lastFunc = cumNf[i + 1] - 1;
				int func = cumNf[i];
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				//build the truth-table row index from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				//after shifting, the lowest bit is the node's new value
				const int value = (F[func] >> shifNum) & 1;
				const int mask = (int) (1u << (i % 32));
				//set bit i of the next state to value
				next[i / 32] ^= (-value ^ next[i / 32]) & mask;
			}
		}
		//step finished: the built state becomes the current one
		for (int w = 0; w < WORDS; w++) {
			current[w] = next[w];
		}
	}

	//publish generator and final state
	states[idx] = localState;
	for (int w = 0; w < WORDS; w++) {
		gpu_initialState[base + w] = next[w];
	}
}
/**
 * Advances every thread's network state by *gpu_steps steps without storing
 * the visited states; variant for n = 193-512 nodes. The number of packed
 * 32-bit words per state is read from *gpu_stateSize and must not exceed 16
 * (the size of the per-thread buffers).
 *
 * Launch layout: 1D grid of 1D blocks. Thread idx = threadIdx.x +
 * blockIdx.x * blockDim.x owns states[idx] and the words
 * gpu_initialState[stateSize * idx .. stateSize * idx + stateSize - 1].
 * There is no bounds guard, so both arrays must hold an entry for every
 * launched thread.
 * Dynamic shared memory: 4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]
 * 4-byte elements (cij, nf, nv, cumNf, cumNv, F, varF, in that order).
 *
 * states           per-thread cuRAND generators; read at entry, written back at exit
 * gpu_n            number of nodes n
 * gpu_nf           n entries, mirrored into shared memory
 * gpu_nv           gpu_cumNf[n] entries, mirrored into shared memory
 * gpu_cumNf        n + 1 entries; node i selects among functions
 *                  cumNf[i] .. cumNf[i + 1] - 1
 * gpu_cumNv        gpu_cumNf[n] + 1 entries; function f reads the inputs
 *                  varF[cumNv[f]] .. varF[cumNv[f + 1] - 1]
 * gpu_F            one packed truth table (int) per function
 * gpu_varF         input node indices of all functions
 * gpu_cij          per-function thresholds, scanned in order until the random
 *                  draw no longer exceeds the threshold
 * gpu_p            per-node perturbation probability
 * gpu_initialState in/out: packed state of every thread
 * gpu_steps        number of steps to simulate in this call
 * gpu_stateSize    number of 32-bit words per state (<= 16)
 */
__global__ void kernelConvergeInitial7(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int MAX_WORDS = 16;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFuncs = gpu_cumNf[n];
	const int totalVars = gpu_cumNv[totalFuncs];

	//carve the dynamic shared buffer into the PBN tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFuncs];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFuncs];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFuncs + 1];
	int* varF = (int*) &F[totalFuncs];

	//all threads of the block share the copy work (block-strided loops)
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalFuncs; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVars; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//the two prefix-sum arrays have one trailing entry each
		cumNf[n] = totalFuncs;
		cumNv[totalFuncs] = totalVars;
	}

	__syncthreads();

	const float p = *gpu_p;
	const int steps = *gpu_steps;
	const int stateSize = *gpu_stateSize;

	//current: state at the start of the step; next: state being built
	int current[MAX_WORDS];
	int next[MAX_WORDS];
	const int base = idx * stateSize;
	for (int w = 0; w < stateSize; w++) {
		current[w] = gpu_initialState[base + w];
		next[w] = current[w];
	}

	//work on a private copy of the generator and store it back at the end
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//perturbation phase: each node flips independently with probability p
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				next[i / 32] ^= (int) (1u << (i % 32));
			}
		}

		if (!perturbation) {
			//no node was perturbed: update every node with one of its functions
			for (int i = 0; i < n; i++) {
				const float draw = curand_uniform(&localState);
				//curand_uniform may return exactly 1.0f, so stop at the node's
				//last function instead of running into the next node's entries
				const int lastFunc = cumNf[i + 1] - 1;
				int func = cumNf[i];
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				//build the truth-table row index from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				//after shifting, the lowest bit is the node's new value
				const int value = (F[func] >> shifNum) & 1;
				const int mask = (int) (1u << (i % 32));
				//set bit i of the next state to value
				next[i / 32] ^= (-value ^ next[i / 32]) & mask;
			}
		}
		//step finished: the built state becomes the current one
		for (int w = 0; w < stateSize; w++) {
			current[w] = next[w];
		}
	}

	//publish generator and final state
	states[idx] = localState;
	for (int w = 0; w < stateSize; w++) {
		gpu_initialState[base + w] = next[w];
	}
}
/**
 * Advances every thread's network state by *gpu_steps steps without storing
 * the visited states; variant for n = 513-2048 nodes. The number of packed
 * 32-bit words per state is read from *gpu_stateSize and must not exceed 64
 * (the size of the per-thread buffers).
 *
 * Launch layout: 1D grid of 1D blocks. Thread idx = threadIdx.x +
 * blockIdx.x * blockDim.x owns states[idx] and the words
 * gpu_initialState[stateSize * idx .. stateSize * idx + stateSize - 1].
 * There is no bounds guard, so both arrays must hold an entry for every
 * launched thread.
 * Dynamic shared memory: 4 * gpu_cumNf[n] + 2 * n + 2 + gpu_cumNv[gpu_cumNf[n]]
 * 4-byte elements (cij, nf, nv, cumNf, cumNv, F, varF, in that order).
 *
 * states           per-thread cuRAND generators; read at entry, written back at exit
 * gpu_n            number of nodes n
 * gpu_nf           n entries, mirrored into shared memory
 * gpu_nv           gpu_cumNf[n] entries, mirrored into shared memory
 * gpu_cumNf        n + 1 entries; node i selects among functions
 *                  cumNf[i] .. cumNf[i + 1] - 1
 * gpu_cumNv        gpu_cumNf[n] + 1 entries; function f reads the inputs
 *                  varF[cumNv[f]] .. varF[cumNv[f + 1] - 1]
 * gpu_F            one packed truth table (int) per function
 * gpu_varF         input node indices of all functions
 * gpu_cij          per-function thresholds, scanned in order until the random
 *                  draw no longer exceeds the threshold
 * gpu_p            per-node perturbation probability
 * gpu_initialState in/out: packed state of every thread
 * gpu_steps        number of steps to simulate in this call
 * gpu_stateSize    number of 32-bit words per state (<= 64)
 */
__global__ void kernelConvergeInitial8(curandState_t* states, int* gpu_n,
		int* gpu_nf, int* gpu_nv, int* gpu_cumNf, int* gpu_cumNv, int* gpu_F,
		int* gpu_varF, float* gpu_cij, float* gpu_p, int* gpu_initialState,
		int* gpu_steps, int* gpu_stateSize) {

	const int MAX_WORDS = 64;
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int n = *gpu_n;
	const int totalFuncs = gpu_cumNf[n];
	const int totalVars = gpu_cumNv[totalFuncs];

	//carve the dynamic shared buffer into the PBN tables
	float* cij = (float*) arrays;
	int* nf = (int*) &cij[totalFuncs];
	int* nv = (int*) &nf[n];
	int* cumNf = (int*) &nv[totalFuncs];
	int* cumNv = (int*) &cumNf[n + 1];
	int* F = (int*) &cumNv[totalFuncs + 1];
	int* varF = (int*) &F[totalFuncs];

	//all threads of the block share the copy work (block-strided loops)
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		nf[i] = gpu_nf[i];
		cumNf[i] = gpu_cumNf[i];
	}
	for (int i = threadIdx.x; i < totalFuncs; i += blockDim.x) {
		nv[i] = gpu_nv[i];
		cumNv[i] = gpu_cumNv[i];
		F[i] = gpu_F[i];
		cij[i] = gpu_cij[i];
	}
	for (int i = threadIdx.x; i < totalVars; i += blockDim.x) {
		varF[i] = gpu_varF[i];
	}
	if (threadIdx.x == 0) {
		//the two prefix-sum arrays have one trailing entry each
		cumNf[n] = totalFuncs;
		cumNv[totalFuncs] = totalVars;
	}

	__syncthreads();

	const float p = *gpu_p;
	const int steps = *gpu_steps;
	const int stateSize = *gpu_stateSize;

	//current: state at the start of the step; next: state being built
	int current[MAX_WORDS];
	int next[MAX_WORDS];
	const int base = idx * stateSize;
	for (int w = 0; w < stateSize; w++) {
		current[w] = gpu_initialState[base + w];
		next[w] = current[w];
	}

	//work on a private copy of the generator and store it back at the end
	curandState_t localState = states[idx];

	for (int j = 0; j < steps; j++) {
		bool perturbation = false;
		//perturbation phase: each node flips independently with probability p
		for (int i = 0; i < n; i++) {
			if (curand_uniform(&localState) < p) {
				perturbation = true;
				next[i / 32] ^= (int) (1u << (i % 32));
			}
		}

		if (!perturbation) {
			//no node was perturbed: update every node with one of its functions
			for (int i = 0; i < n; i++) {
				const float draw = curand_uniform(&localState);
				//curand_uniform may return exactly 1.0f, so stop at the node's
				//last function instead of running into the next node's entries
				const int lastFunc = cumNf[i + 1] - 1;
				int func = cumNf[i];
				while (func < lastFunc && draw > cij[func]) {
					func++;
				}

				const int firstVar = cumNv[func];
				const int numVars = cumNv[func + 1] - firstVar;
				//build the truth-table row index from the current input values
				int shifNum = 0;
				for (int ind = 0; ind < numVars; ind++) {
					const int var = varF[firstVar + ind];
					if (((current[var / 32] >> (var % 32)) & 1) != 0) {
						shifNum += powNum[1][ind];
					}
				}
				//after shifting, the lowest bit is the node's new value
				const int value = (F[func] >> shifNum) & 1;
				const int mask = (int) (1u << (i % 32));
				//set bit i of the next state to value
				next[i / 32] ^= (-value ^ next[i / 32]) & mask;
			}
		}
		//step finished: the built state becomes the current one
		for (int w = 0; w < stateSize; w++) {
			current[w] = next[w];
		}
	}

	//publish generator and final state
	states[idx] = localState;
	for (int w = 0; w < stateSize; w++) {
		gpu_initialState[base + w] = next[w];
	}
}
/**
 * Classifies every stored state of each thread's trajectory with
 * checkMetaStateInt and accumulates per-thread statistics.
 *
 * For thread tid the state of step j is looked up at offset
 * tid * stateSize + j * (stateSize * blockDim.x * gridDim.x) of
 * gpu_trajectoryKernel. A state counts as class 1 when checkMetaStateInt
 * returns true and as class 0 otherwise.
 *
 * Outputs (all indexed by the global thread id):
 *   gpu_stateA[tid]                    number of class-1 states
 *   gpu_stateB[tid]                    number of class-0 states
 *   gpu_bridge[tid]                    class of the last examined state
 *   gpu_transitionsLastChain[4*tid+k]  transition counts in the order
 *                                      0->0, 0->1, 1->0, 1->1
 *
 * The two property masks are placed in dynamic shared memory behind the
 * regions that the simulation kernels use for cij, nf, nv, cumNf, cumNv, F
 * and varF; those regions are skipped here but not accessed.
 * gpu_nf and gpu_nv are not read by this kernel.
 */
__global__ void kernelUpdateTrajectory(int* gpu_n, int* gpu_nf, int* gpu_nv,
		int* gpu_cumNf, int* gpu_cumNv, int* gpu_trajectoryKernel,
		int* gpu_steps, int* gpu_positiveIndex, int* gpu_negativeIndex,
		long* gpu_stateA, long* gpu_stateB, int* gpu_transitionsLastChain,
		int* gpu_bridge, int* gpu_stateSize) {
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const int nodeCount = *gpu_n;
	const int words = *gpu_stateSize;
	const int totalFuncs = gpu_cumNf[nodeCount];
	const int totalVars = gpu_cumNv[totalFuncs];

	//element counts in front of the masks (float and int are both 4 bytes):
	//cij[totalFuncs] nf[nodeCount] nv[totalFuncs] cumNf[nodeCount + 1]
	//cumNv[totalFuncs + 1] F[totalFuncs] varF[totalVars]
	int* positiveIndex = arrays
			+ (4 * totalFuncs + 2 * nodeCount + 2 + totalVars);
	int* negativeIndex = positiveIndex + words;

	//one thread per block fills the shared masks
	if (threadIdx.x == 0) {
		for (int w = 0; w < words; w++) {
			positiveIndex[w] = gpu_positiveIndex[w];
			negativeIndex[w] = gpu_negativeIndex[w];
		}
	}

	__syncthreads();

	const int steps = *gpu_steps;
	//distance between two consecutive steps of the same thread
	const int stride = words * blockDim.x * gridDim.x;
	int offset = tid * words;

	int countA = 0; //states for which the check returned true
	int countB = 0; //states for which the check returned false
	int transitions[2][2] = { { 0, 0 }, { 0, 0 } };

	//the first state only contributes to the counters
	int currentClass = checkMetaStateInt(words, positiveIndex, negativeIndex,
			gpu_trajectoryKernel, offset) ? 1 : 0;
	if (currentClass == 1) {
		countA++;
	} else {
		countB++;
	}

	//every later state also contributes one transition
	for (int step = 1; step < steps; step++) {
		const int previousClass = currentClass;
		offset += stride;
		currentClass = checkMetaStateInt(words, positiveIndex, negativeIndex,
				gpu_trajectoryKernel, offset) ? 1 : 0;
		if (currentClass == 1) {
			countA++;
		} else {
			countB++;
		}
		transitions[previousClass][currentClass]++;
	}

	//copy the per-thread results to global memory
	gpu_bridge[tid] = currentClass;
	gpu_stateA[tid] = countA;
	gpu_stateB[tid] = countB;
	int* counts = gpu_transitionsLastChain + tid * 4;
	counts[0] = transitions[0][0];
	counts[1] = transitions[0][1];
	counts[2] = transitions[1][0];
	counts[3] = transitions[1][1];
}



/**
 * Packs a vector of booleans into one int: element i becomes bit i.
 *
 * Only the first 32 elements can be represented; any further elements are
 * ignored (shifting a 32-bit value by 32 or more is undefined behaviour).
 * The bits are accumulated in an unsigned value so that setting bit 31 does
 * not overflow a signed int.
 *
 * myvector  booleans to pack, least significant bit first
 * returns   the packed bits; 0 for an empty vector
 */
int fromVector(std::vector<bool> myvector) {
	const std::size_t maxBits = 32;
	const std::size_t count =
			myvector.size() < maxBits ? myvector.size() : maxBits;
	unsigned int packed = 0u;
	for (std::size_t i = 0; i < count; ++i) {
		if (myvector[i]) {
			packed |= 1u << i;
		}
	}
	return static_cast<int>(packed);
}

/**
 * Packs an array of JNI booleans into one int: element i becomes bit i.
 *
 * Only the first 32 elements can be represented; any further elements are
 * ignored (shifting a 32-bit value by 32 or more is undefined behaviour).
 * The bits are accumulated in an unsigned value so that setting bit 31 does
 * not overflow a signed int.
 *
 * myvector  booleans to pack, least significant bit first; may be NULL
 * len       number of elements in myvector
 * returns   the packed bits; 0 when myvector is NULL or len <= 0
 */
int fromVector(jboolean *myvector,int len) {
	if (myvector == NULL) {
		return 0;
	}
	const int count = len < 32 ? len : 32;
	unsigned int packed = 0u;
	for (int i = 0; i < count; i++) {
		if (myvector[i]) {
			packed |= 1u << i;
		}
	}
	return static_cast<int>(packed);
}

/**
 * Estimates the two off-diagonal transition frequencies of a two-state chain
 * from a 2x2 table of transition counts.
 *
 * transitionsLast  counts, transitionsLast[from][to]
 * alphabeta        output: alphabeta[0] (alpha) = share of transitions
 *                  leaving state 1 that go to state 0; alphabeta[1] (beta) =
 *                  share of transitions leaving state 0 that go to state 1.
 *                  A row without any transition yields 0.
 */
void calAlphaBeta(long transitionsLast[][2], float* alphabeta) {
	const long leavingZero = transitionsLast[0][0] + transitionsLast[0][1];
	const long leavingOne = transitionsLast[1][0] + transitionsLast[1][1];

	//beta: 0 -> 1
	alphabeta[1] = (leavingZero == 0) ? 0.0f
			: (float) transitionsLast[0][1] / (float) leavingZero;
	//alpha: 1 -> 0
	alphabeta[0] = (leavingOne == 0) ? 0.0f
			: (float) transitionsLast[1][0] / (float) leavingOne;
}
/**
 * Chooses a launch configuration (threads per block, number of blocks) for
 * device 0 by enumerating candidates and keeping the one with the largest
 * estimated number of resident threads per multiprocessor.
 *
 * sizeSharedMemory1  shared memory (bytes) used to store the PBN and the
 *                    property, not including the initial states
 * stateSize          currently unused
 * blockInfor         output: blockInfor[0] = block size, blockInfor[1] =
 *                    number of blocks
 *
 * If the device properties cannot be queried, an error is printed and the
 * defaults (block size 32, 1 block) are returned instead of searching on
 * uninitialised data.
 */
void computeDeviceInfor(int sizeSharedMemory1, int stateSize, int* blockInfor) {
	const int wrapSize = 32;
	int selectBlockSize = 32, selectNumBlock = 1;

	//get device information
	cudaDeviceProp prop;
	cudaError_t err = cudaGetDeviceProperties(&prop, 0);	//always use the first device
	if (err != cudaSuccess) {
		printf("Cannot query CUDA device 0: %s\n", cudaGetErrorString(err));
		blockInfor[0] = selectBlockSize;
		blockInfor[1] = selectNumBlock;
		return;
	}

	const int numMP = prop.multiProcessorCount;
	const int maxSharedMemoryPerBlock = (int) prop.sharedMemPerBlock;	//in bytes
	int numRegisterPerMP = 64 * 1024; //in unit. 1 unit=32 bits, specially for K20m
	const int major = prop.major;
	const int minor = prop.minor;
	int maxBlocksPerMP = 16; //Maximum number of resident blocks per multiprocessor

	if (major < 3 || (major < 4 && minor < 2)) {
		numRegisterPerMP = 32 * 1024;
		printf(
				"Caution! The device computational capability is too small. The program requires computational capability to be bigger than 3.0.\n");
	}
	if (major < 3) {
		maxBlocksPerMP = 8;
	} else if (major >= 5) {
		maxBlocksPerMP = 32;
	}
	if (sizeSharedMemory1 > maxSharedMemoryPerBlock) {
		//the search below finds no candidate in this case and the defaults are returned
		printf("The PBN is too large to handle with the current device.\n");
	}

	int occupancy = 0;
	int countBlocks = 1;	//candidate number of blocks per multiprocessor
	int numBlock = countBlocks * numMP;
	bool possible = true;
	//keep increasing the blocks per multiprocessor until no block size fits any more
	while (possible) {
		possible = false;
		int activeBLocksPerMp = countBlocks;
		if (activeBLocksPerMp > maxBlocksPerMP)
			activeBLocksPerMp = maxBlocksPerMP;
		int countBlockSize = 1;
		int blockSize = countBlockSize * wrapSize;
		//try block sizes in multiples of the warp size while the register and
		//shared memory budgets per block are respected
		while (blockSize < (numRegisterPerMP / countBlocks) / RegPerThread
				&& sizeSharedMemory1 < maxSharedMemoryPerBlock / countBlocks) {
			//shared memory limit
			//NOTE(review): this compares a byte count with a block count;
			//possibly maxSharedMemoryPerBlock / sizeSharedMemory1 was
			//intended. Kept unchanged - confirm.
			if (maxSharedMemoryPerBlock / countBlocks < activeBLocksPerMp)
				activeBLocksPerMp = maxSharedMemoryPerBlock / countBlocks;
			//register limit
			if ((numRegisterPerMP / RegPerThread / blockSize)
					< activeBLocksPerMp) {
				activeBLocksPerMp =
						(numRegisterPerMP / RegPerThread / blockSize);
			}
			//estimated resident threads per multiprocessor for this candidate
			const int tmp = activeBLocksPerMp * blockSize;
			//prefer higher occupancy; on a tie prefer more blocks
			if (tmp > occupancy
					|| (tmp == occupancy && numBlock > selectNumBlock)) {
				occupancy = tmp;
				selectBlockSize = blockSize;
				selectNumBlock = numBlock;
			}
			possible = true;
			countBlockSize++;
			blockSize = countBlockSize * wrapSize;
		}
		countBlocks++;
		numBlock = countBlocks * numMP;
	}

	blockInfor[0] = selectBlockSize;
	blockInfor[1] = selectNumBlock;
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    getCudaDeviceCount
 * Signature: ()I
 *
 * Returns the number of CUDA devices reported by cudaGetDeviceCount, or 0
 * when the query fails (so the Java side never sees an uninitialised value).
 */
JNIEXPORT jint JNICALL Java_simulationMethod_GermanGPU_getCudaDeviceCount
  (JNIEnv *env, jclass cls){
	//query into a plain int: cudaGetDeviceCount takes int*, and jint is not
	//guaranteed to be the same type on every platform
	int deviceCount = 0;
	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
		return 0;
	}
	return (jint) deviceCount;
}

//------------------------------------------------------------------------------

// Raises java.lang.IllegalArgumentException with the given message.
static void throwPbnArgumentError(JNIEnv *env, const char *message) {
	jclass exc = env->FindClass("java/lang/IllegalArgumentException");
	if (exc != NULL) {
		env->ThrowNew(exc, message);
	}
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    initialGPUPBN
 * Signature: (LPBN/PBN;)V
 *
 * Copies the network description held by the Java PBN object into the
 * file-level globals n, stateSize, nf, nv, F, varF, cij and p.
 *
 * Getters invoked on lpbn (name and JNI descriptor):
 *   getN            ()I                   number of nodes
 *   getNf           ()[I                  n entries copied into nf
 *   getNv           ()Ljava/util/List;    List<Integer>, one entry per function
 *   getF            ()Ljava/util/List;    List<boolean[]>, packed with fromVector()
 *   getVarFInt      ()Ljava/util/List;    List<int[]>, concatenated into varF
 *   getCij          ()Ljava/util/List;    List<double[]>, concatenated into cij
 *   getPerturbation ()D                   stored in p
 *
 * The values read are echoed to stdout, exactly as before.
 * A java.lang.IllegalArgumentException is raised (and the function returns
 * early) when n is not positive or when the lists hold more elements than
 * the buffers sized from getNv()/getF() can take.
 *
 * NOTE(review): buffers from an earlier call are not freed here; confirm
 * ownership of the globals before adding free() calls.
 */
JNIEXPORT void JNICALL Java_simulationMethod_GermanGPU_initialGPUPBN
  (JNIEnv *env, jclass cls, jobject lpbn){
	jclass pbncl = env->GetObjectClass(lpbn);
	jmethodID mid = env->GetMethodID(pbncl, "getN", "()I");
	n = env->CallIntMethod(lpbn, mid);
	if (n <= 0) {
		throwPbnArgumentError(env, "initialGPUPBN: the network must have at least one node");
		return;
	}
	// number of 32-bit words needed to hold one state (ceil(n / 32))
	stateSize = n / 32;
	if (stateSize * 32 < n)
		stateSize++;

	//nf
	mid = env->GetMethodID(pbncl, "getNf", "()[I");
	jintArray lnf = (jintArray) env->CallObjectMethod(lpbn, mid);
	nf = (int *) malloc(sizeof(int) * n);
	(*env).GetIntArrayRegion(lnf, 0, n, &nf[0]);
	cout << n << endl;

	//nv
	mid = env->GetMethodID(pbncl, "getNv", "()Ljava/util/List;");
	jobject lnv = env->CallObjectMethod(lpbn, mid);
	// java.util.List accessors shared by all list-valued getters below
	jclass cList = env->FindClass("java/util/List");
	jmethodID mSize = env->GetMethodID(cList, "size", "()I");
	jmethodID mGet = env->GetMethodID(cList, "get", "(I)Ljava/lang/Object;");
	jint size = env->CallIntMethod(lnv, mSize);
	nv = (int *) malloc(sizeof(int) * size);

	jclass cInt = env->FindClass("java/lang/Integer");
	jmethodID getValue = env->GetMethodID(cInt, "intValue", "()I");
	int cumNv = 0;	// total number of parent variables over all functions
	cout << "nv, size=" << size << endl;
	for (int i = 0; i < size; i++) {
		jobject jInteger = env->CallObjectMethod(lnv, mGet, i);
		nv[i] = env->CallIntMethod(jInteger, getValue);
		// Release per-iteration references: the local reference table is
		// small and is otherwise only emptied when this native call returns.
		env->DeleteLocalRef(jInteger);
		cumNv += nv[i];
		cout << nv[i] << " ";
	}
	cout << endl;

	//F
	mid = env->GetMethodID(pbncl, "getF", "()Ljava/util/List;");
	jobject lF = env->CallObjectMethod(lpbn, mid);
	size = env->CallIntMethod(lF, mSize);
	const int sizeF = size;	// number of Boolean functions
	F = (int *) malloc(sizeof(int) * size);
	cout << "F" << endl;
	for (int i = 0; i < size; i++) {
		jbooleanArray elementF = (jbooleanArray) env->CallObjectMethod(lF, mGet, i);
		jsize len = (*env).GetArrayLength(elementF);
		// heap buffer instead of a variable-length stack array
		std::vector<jboolean> truthTable(len);
		(*env).GetBooleanArrayRegion(elementF, 0, len, truthTable.data());
		env->DeleteLocalRef(elementF);
		F[i] = fromVector(truthTable.data(), len);
		cout << F[i] << " ";
	}

	//varF
	mid = env->GetMethodID(pbncl, "getVarFInt", "()Ljava/util/List;");
	jobject lvarF = env->CallObjectMethod(lpbn, mid);
	size = env->CallIntMethod(lvarF, mSize);
	varF = (int *) malloc(sizeof(int) * cumNv);
	int index = 0;
	cout << "varF" << endl;
	for (int i = 0; i < size; i++) {
		jintArray elementVarF = (jintArray) env->CallObjectMethod(lvarF, mGet, i);
		jsize len = (*env).GetArrayLength(elementVarF);
		// varF was sized from the sum of nv; refuse to write past it
		if (len > cumNv - index) {
			env->DeleteLocalRef(elementVarF);
			throwPbnArgumentError(env, "initialGPUPBN: getVarFInt() holds more entries than getNv() announces");
			return;
		}
		(*env).GetIntArrayRegion(elementVarF, 0, len, &varF[index]);
		env->DeleteLocalRef(elementVarF);
		for (int j = 0; j < len; j++) {
			cout << varF[j + index] << "\t";
		}
		cout << endl;
		index += len;
	}

	//cij
	mid = env->GetMethodID(pbncl, "getCij", "()Ljava/util/List;");
	jobject lcij = env->CallObjectMethod(lpbn, mid);
	size = env->CallIntMethod(lcij, mSize);
	cij = (float *) malloc(sizeof(float) * sizeF);
	// staging buffer for the Java doubles; zero-filled so that entries the
	// lists do not cover are defined
	std::vector<double> cij_d(sizeF, 0.0);
	index = 0;
	for (int i = 0; i < size; i++) {
		jdoubleArray elementcij = (jdoubleArray) env->CallObjectMethod(lcij, mGet, i);
		jsize len = (*env).GetArrayLength(elementcij);
		// one probability per function: refuse to write past sizeF entries
		if (len > sizeF - index) {
			env->DeleteLocalRef(elementcij);
			throwPbnArgumentError(env, "initialGPUPBN: getCij() holds more entries than getF()");
			return;
		}
		(*env).GetDoubleArrayRegion(elementcij, 0, len, cij_d.data() + index);
		env->DeleteLocalRef(elementcij);
		index += len;
	}
	for (int i = 0; i < sizeF; i++) {
		cij[i] = (float) cij_d[i];
		cout << cij[i] << "\t";
	}
	cout << endl;

	//p
	mid = env->GetMethodID(pbncl, "getPerturbation", "()D");
	jdouble perturbation = env->CallDoubleMethod(lpbn, mid);
	p = (float) perturbation;
	cout << "\n" << p;
}

// Splits each 64-bit element of a Java long[] into two 32-bit words (low word
// first) and stores them in words[0 .. wordCount-1]. Words beyond wordCount
// are dropped; words not covered by the array are left untouched.
static void unpackExpressionMask(JNIEnv *env, jlongArray packed, int *words,
		int wordCount) {
	if (packed == NULL || words == NULL)
		return;
	const jsize len = env->GetArrayLength(packed);
	jlong *body = env->GetLongArrayElements(packed, NULL);
	if (body == NULL)
		return;	// the JVM has already raised OutOfMemoryError
	for (jsize i = 0; i < len && i * 2 < wordCount; i++) {
		words[i * 2] = (int) body[i];
		if (i * 2 + 1 < wordCount)
			words[i * 2 + 1] = (int) (body[i] >> 32);
	}
	// nothing was modified, so release without copying back
	env->ReleaseLongArrayElements(packed, body, JNI_ABORT);
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    initialGPUExpression
 * Signature: ([J[J)V
 *
 * Allocates g_positiveIndex and g_negativeIndex (stateSize ints each) and
 * fills them from the two Java long[] masks, two 32-bit words per long.
 * Words that the Java arrays do not cover are zero.
 *
 * Each array is unpacked with its own length, and no word is written at or
 * beyond stateSize, so arrays of different or excessive length are safe.
 * stateSize must already have been set (initialGPUPBN assigns it).
 *
 * NOTE(review): buffers from an earlier call are not freed here; confirm
 * ownership of the globals before adding free() calls.
 */
JNIEXPORT void JNICALL Java_simulationMethod_GermanGPU_initialGPUExpression
  (JNIEnv *env, jclass cls, jlongArray lpositiveIndex, jlongArray lnegativeIndex){
	// calloc zero-fills, which provides the padding for uncovered words;
	// request at least one element so that success never yields NULL
	const size_t wordCount = stateSize > 0 ? (size_t) stateSize : 1;
	g_positiveIndex = (int *) calloc(wordCount, sizeof(int));
	g_negativeIndex = (int *) calloc(wordCount, sizeof(int));
	if (g_positiveIndex == NULL || g_negativeIndex == NULL) {
		jclass exc = env->FindClass("java/lang/OutOfMemoryError");
		if (exc != NULL)
			env->ThrowNew(exc, "initialGPUExpression: cannot allocate index masks");
		return;
	}
	unpackExpressionMask(env, lpositiveIndex, g_positiveIndex, stateSize);
	unpackExpressionMask(env, lnegativeIndex, g_negativeIndex, stateSize);
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    initialGerman
 * Signature: ([D)V
 *
 * Stores the simulation parameters in the file-level globals:
 *   parameters[0] -> precision
 *   parameters[1] -> confidence
 *   parameters[2] -> epsilon_twostate
 *   parameters[3] -> blockInfor[0]   (only when more than 4 values are given)
 *   parameters[4] -> blockInfor[1]   (only when more than 4 values are given)
 * blockInfor[0] is reset to 0 when no launch configuration is supplied;
 * run() treats 0 as "compute the configuration automatically".
 *
 * Raises java.lang.IllegalArgumentException when fewer than three values
 * are passed.
 */
JNIEXPORT void JNICALL Java_simulationMethod_GermanGPU_initialGerman
  (JNIEnv *env, jclass cls, jdoubleArray parameters){
	blockInfor[0] = 0;

	const jsize len = (parameters == NULL) ? 0 : (*env).GetArrayLength(parameters);
	if (len < 3) {
		jclass exc = env->FindClass("java/lang/IllegalArgumentException");
		if (exc != NULL)
			env->ThrowNew(exc, "initialGerman: expected at least {precision, confidence, epsilon}");
		return;
	}

	// Copy into a local buffer: nothing has to be released afterwards.
	jdouble values[5] = { 0, 0, 0, 0, 0 };
	const jsize wanted = (len > 4) ? 5 : 3;
	(*env).GetDoubleArrayRegion(parameters, 0, wanted, values);

	precision = (float) values[0];
	confidence = (float) values[1];
	epsilon_twostate = (float) values[2];
	if (len > 4) {
		blockInfor[0] = (int) values[3];
		blockInfor[1] = (int) values[4];
	}
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    setOutputName
 * Signature: (Ljava/lang/String;)V
 *
 * Stores the given Java string in the global outputName, which run() opens
 * as its log file. A null string leaves outputName unchanged.
 */
JNIEXPORT void JNICALL Java_simulationMethod_GermanGPU_setOutputName
  (JNIEnv *env, jclass cls, jstring name){
	if (name == NULL)
		return;
	const char *s = env->GetStringUTFChars(name, NULL);
	if (s == NULL)
		return;	// the JVM has already raised OutOfMemoryError
	outputName = s;	// std::string makes its own copy
	// the UTF buffer belongs to the JVM and must be handed back
	env->ReleaseStringUTFChars(name, s);
}

// Raises the named Java exception with the given message; used by run() to
// reject missing or unsupported input without touching the GPU.
static void throwGermanRunError(JNIEnv *env, const char *className,
		const char *message) {
	jclass exc = env->FindClass(className);
	if (exc != NULL) {
		env->ThrowNew(exc, message);
	}
}

/*
 * Class:     simulationMethod_GermanGPU
 * Method:    run
 * Signature: ()[D
 *
 * Runs the simulation on the model stored in the file-level globals
 * (n, nf, nv, F, varF, cij, p, g_positiveIndex, g_negativeIndex, precision,
 * confidence, blockInfor) and appends a log to the file named by outputName.
 *
 * Phases:
 *   1. Host preparation: cumulative function/variable counts, cumulative
 *      selection probabilities, dynamic shared-memory size, launch
 *      configuration (user supplied through blockInfor or computed by
 *      computeDeviceInfor).
 *   2. Device allocation and host-to-device copies, curand state setup.
 *   3. Convergence loop: the kernelConverge* kernels are relaunched until
 *      German::computePstr is within 1e-3 of 1 on two consecutive rounds or
 *      the step count exceeds the trajectory buffer (3000).
 *   4. Two-state loop: chains are extended with the kernel* kernels until the
 *      required sample size TSN does not exceed the transitions collected.
 *
 * The kernel variant is chosen from n; networks with more than 2048 nodes
 * have no kernel and are rejected with IllegalArgumentException.
 * IllegalStateException is raised when the model globals are not set.
 *
 * Returns a double[4]:
 *   [0] estimated probability  stateASum / (stateASum + stateBSum)
 *   [1] number of simulated steps, including the burn-in steps of every chain
 *   [2] CPU time in seconds (std::clock)
 *   [3] CUDA event time in seconds
 *
 * CUDA failures are reported through HANDLE_ERROR, which terminates the
 * process.
 */
JNIEXPORT jdoubleArray JNICALL Java_simulationMethod_GermanGPU_run
  (JNIEnv *env, jclass cls){
	if (nf == NULL || nv == NULL || F == NULL || varF == NULL || cij == NULL
			|| g_positiveIndex == NULL || g_negativeIndex == NULL) {
		throwGermanRunError(env, "java/lang/IllegalStateException",
				"GermanGPU.run: initialGPUPBN and initialGPUExpression must be called first");
		return NULL;
	}
	if (n < 1 || n > 2048) {
		// no kernel variant exists outside this range
		throwGermanRunError(env, "java/lang/IllegalArgumentException",
				"GermanGPU.run: only networks with 1 to 2048 nodes are supported");
		return NULL;
	}

	int block = 2, blockSize = 3;
	int steps = 100;			//german and rubin n
	int* gpu_steps;
	float r = precision;

	ofstream output;
	output.open(outputName, ios::out | ios::app);

	output << "***************************\n";
	output << "running two-state on model ";

	std::clock_t cpu_start = std::clock();
	double duration;

	int size_sharedMemory = 0;

	int* gpu_n;
	int* gpu_nf;
	int* gpu_cumNf;
	int* gpu_nv;
	int* gpu_cumNv;
	int* gpu_F;
	int* gpu_varF;
	float* gpu_p;
	float* gpu_cij;

	//calculate cumulative number of functions
	std::vector<int> cumNf(n + 1);
	cumNf[0] = 0;
	for (int i = 0; i < n; i++) {
		cumNf[i + 1] = cumNf[i] + nf[i];
	}
	const int totalFunctions = cumNf[n];

	//calculate cumulative number of variables
	std::vector<int> cumNv(totalFunctions + 1);
	cumNv[0] = 0;
	for (int i = 0; i < totalFunctions; i++) {
		cumNv[i + 1] = cumNv[i] + nv[i];
	}
	const int totalVariables = cumNv[totalFunctions];

	// Cumulative selection probabilities per node. They are built in a local
	// copy so that the global cij keeps the raw probabilities and a second
	// call to run() does not accumulate already-cumulative values.
	std::vector<float> cumCij(cij, cij + totalFunctions);
	int count = 0;
	for (int i = 0; i < n; i++) {
		float sum = 0;
		for (int j = 0; j < nf[i]; j++) {
			sum += cumCij[count];
			cumCij[count] = sum + epsilon;
			count++;
		}
		//make sure that the last element equals to 1
		if (nf[i] > 0 && cumCij[count - 1] < 1) {
			cumCij[count - 1] = 1 + epsilon;
		}
	}

	// dynamic shared memory requested for every kernel launch
	size_sharedMemory += n * sizeof(int); //nf
	size_sharedMemory += totalFunctions * sizeof(int); //nv
	size_sharedMemory += (n + 1) * sizeof(int); //cumNf
	size_sharedMemory += (totalFunctions + 1) * sizeof(int); //cumNv
	size_sharedMemory += totalFunctions * sizeof(int); //F
	size_sharedMemory += totalVariables * sizeof(int); //varF
	size_sharedMemory += totalFunctions * sizeof(float); //cij
	size_sharedMemory += stateSize * sizeof(int); //positiveIndex
	size_sharedMemory += stateSize * sizeof(int); //negativeIndex

	if (blockInfor[0] != 0) {
		// user supplied configuration: [0] = number of blocks, [1] = block size
		blockSize = blockInfor[1];
		block = blockInfor[0];
	} else {
		// computed configuration: [0] = block size, [1] = number of blocks
		computeDeviceInfor(size_sharedMemory, stateSize, blockInfor);
		block = blockInfor[1];
		blockSize = blockInfor[0];
		// Restore the "automatic" marker. Otherwise the next call would take
		// the branch above and read the computed values with swapped meaning.
		blockInfor[0] = 0;
	}
	if (block <= 0 || blockSize <= 0) {
		throwGermanRunError(env, "java/lang/IllegalArgumentException",
				"GermanGPU.run: block count and block size must be positive");
		return NULL;
	}

	const int N = block * blockSize;	// total number of threads = number of chains
	output << ", blockSize=" << blockSize << ", block=" << block
			<< ", precision=" << r << ", sharedMemorySize=" << size_sharedMemory
			<< " bytes.\n";

	float memsettime;
	cudaEvent_t start, stop;

	// initialize CUDA timer
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));
	HANDLE_ERROR(cudaEventRecord(start, 0));

	//for German and Runbin method
	int currentTrajectorySize = 0; // store the current trajectory size
	const int trajectoryLength = 3000;
	// heap storage: N can be large enough to overflow a thread stack
	std::vector<float> mean(2 * (size_t) N);
	float* gpu_mean;
	float* gpu_trajectory;
	int* gpu_initialState;
	int* gpu_stateSize;
	int* gpu_trajectoryKernel;
	int* gpu_positiveIndex;
	int* gpu_negativeIndex;

	//allocate method in device (sizes are computed in size_t to avoid int overflow)
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_n, sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_nf, n * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_nv, totalFunctions * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_cumNf, (n + 1) * sizeof(int)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_cumNv, (totalFunctions + 1) * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_F, totalFunctions * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_varF, totalVariables * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_cij, totalFunctions * sizeof(float)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_p, sizeof(float)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_steps, sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_mean, (size_t) 2 * N * sizeof(float)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_trajectory,
					(size_t) N * trajectoryLength * sizeof(float)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_initialState,
					(size_t) stateSize * N * sizeof(int)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_positiveIndex, stateSize * sizeof(int)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_negativeIndex, stateSize * sizeof(int)));
	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_trajectoryKernel,
					(size_t) trajectoryLength * stateSize * N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_stateSize, sizeof(int)));

	//copy data from host to device
	HANDLE_ERROR(cudaMemcpy(gpu_n, &n, sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_nf, nf, n * sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_nv, nv, totalFunctions * sizeof(int),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_cumNf, cumNf.data(), (n + 1) * sizeof(int),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_cumNv, cumNv.data(),
					(totalFunctions + 1) * sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_F, F, totalFunctions * sizeof(int),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_varF, varF, totalVariables * sizeof(int),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_cij, cumCij.data(), totalFunctions * sizeof(float),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(cudaMemcpy(gpu_p, &p, sizeof(float), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_steps, &steps, sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_stateSize, &stateSize, sizeof(int),
					cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_positiveIndex, g_positiveIndex,
					stateSize * sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR(
			cudaMemcpy(gpu_negativeIndex, g_negativeIndex,
					stateSize * sizeof(int), cudaMemcpyHostToDevice));

	// host constant data: row 0 is all zero, row 1 holds the single-bit masks
	int hPowNum[2][32];
	for (int i = 0; i < 32; i++) {
		hPowNum[0][i] = 0;
		// shift in unsigned arithmetic: bit 31 does not fit a signed shift
		hPowNum[1][i] = (int) (1u << i);
	}
	//copy host data to constant memory
	HANDLE_ERROR(cudaMemcpyToSymbol(powNum, hPowNum, sizeof(int) * 32 * 2));

	// CUDA's random number library uses curandState_t to keep track of the seed value
	// we will store a random state for every thread
	curandState_t* states;

	// allocate space on the GPU for the random states
	HANDLE_ERROR(cudaMalloc((void** ) &states, N * sizeof(curandState_t)));

	// invoke the GPU to initialize all of the random states, seeded with the wall-clock time
	init<<<block, blockSize>>>((int) time(0), states);
	HANDLE_ERROR(cudaGetLastError());

	German german;
	float psrf;
	bool done = false, done1 = false;
	float threshold = 1e-3; //judge when to converge
	currentTrajectorySize = 0;

	//german and rubin method: initial burn-in, kernel variant selected by n
	if (n < 33) {
		kernelConvergeInitial<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 65) {
		kernelConvergeInitial2<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 97) {
		kernelConvergeInitial3<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 129) {
		kernelConvergeInitial4<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 161) {
		kernelConvergeInitial5<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 193) {
		kernelConvergeInitial6<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else if (n < 513) {
		kernelConvergeInitial7<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	} else {
		kernelConvergeInitial8<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,gpu_stateSize);
	}
	HANDLE_ERROR(cudaGetLastError());

	currentTrajectorySize = steps;
	while (!done) {
		//call kernel function, need to allocate the shared method size here as the third parameters
		if (n < 33) {
			kernelConverge<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 65) {
			kernelConverge2<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 97) {
			kernelConverge3<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 129) {
			kernelConverge4<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 161) {
			kernelConverge5<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 193) {
			kernelConverge6<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else if (n < 513) {
			kernelConverge7<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		} else {
			kernelConverge8<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_mean,gpu_trajectory,gpu_trajectoryKernel,gpu_steps,gpu_stateSize);
		}
		HANDLE_ERROR(cudaGetLastError());

		currentTrajectorySize = currentTrajectorySize + steps;
		HANDLE_ERROR(
				cudaMemcpy(mean.data(), gpu_mean, (size_t) 2 * N * sizeof(float),
						cudaMemcpyDeviceToHost));

		psrf = german.computePstr(mean.data(), currentTrajectorySize, n, N);

		if (fabsf(1.0f - psrf) > threshold) {
			done = false;
			done1 = false;
			steps = currentTrajectorySize;
		} else {
			// convergence must be observed on two consecutive rounds
			if (done1) {
				done = true;
			} else {
				done1 = true;
				steps = currentTrajectorySize;
			}
		}
		if (steps > trajectoryLength) {
			// the trajectory buffers cannot hold another doubling: stop here
			done = true;
			steps = currentTrajectorySize / 2;
		}
		HANDLE_ERROR(
				cudaMemcpy(gpu_steps, &steps, sizeof(int), cudaMemcpyHostToDevice));
	}

	// buffers only needed by the convergence phase
	cudaFree(gpu_mean);
	cudaFree(gpu_trajectory);

	//*****************************two state***************************

	float invPhi = ltqnorm(0.5 * (1 + confidence));
	int kstep = 1;
	long stateASum = 0, stateBSum = 0;
	std::vector<long> stateA(N, 0);			//how many steps are in state A
	std::vector<long> stateB(N, 0);
				//element 0=transitions from meta state B to meta state B
				//element 1=transitions from meta state B to meta state A
				//element 2=transitions from meta state A to meta state B
				//element 3=transitions from meta state A to meta state A
	long transitionsLast[2][2];
	std::vector<int> transitionsLastChain((size_t) N * 4);

	long* gpu_stateA;
	long* gpu_stateB;
	int* gpu_transitionsLastChain;
	int* gpu_bridge;

	HANDLE_ERROR(
			cudaMalloc((void** ) &gpu_transitionsLastChain,
					(size_t) N * 4 * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_stateA, N * sizeof(long)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_stateB, N * sizeof(long)));
	HANDLE_ERROR(cudaMalloc((void** ) &gpu_bridge, N * sizeof(int)));

	kernelUpdateTrajectory<<<block,blockSize,size_sharedMemory>>>(gpu_n, gpu_nf,
			gpu_nv, gpu_cumNf, gpu_cumNv,
			gpu_trajectoryKernel, gpu_steps, gpu_positiveIndex, gpu_negativeIndex,
			gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
	HANDLE_ERROR(cudaGetLastError());

	HANDLE_ERROR(
			cudaMemcpy(stateA.data(), gpu_stateA, N * sizeof(long),
					cudaMemcpyDeviceToHost));
	HANDLE_ERROR(
			cudaMemcpy(stateB.data(), gpu_stateB, N * sizeof(long),
					cudaMemcpyDeviceToHost));
	HANDLE_ERROR(
			cudaMemcpy(transitionsLastChain.data(), gpu_transitionsLastChain,
					(size_t) 4 * N * sizeof(int), cudaMemcpyDeviceToHost));

	transitionsLast[0][0] = 0;
	transitionsLast[0][1] = 0;
	transitionsLast[1][0] = 0;
	transitionsLast[1][1] = 0;
	for (int i = 0; i < N; i++) {
		stateASum += stateA[i];
		stateBSum += stateB[i];
		transitionsLast[0][0] += transitionsLastChain[i * 4];
		transitionsLast[0][1] += transitionsLastChain[i * 4 + 1];
		transitionsLast[1][0] += transitionsLastChain[i * 4 + 2];
		transitionsLast[1][1] += transitionsLastChain[i * 4 + 3];
	}
	output << "stateASum=" << stateASum << " stateBSum=" << stateBSum << endl;
	output << "transitionsLast[0][0]=" << transitionsLast[0][0]
			<< ", transitionsLast[0][1]=" << transitionsLast[0][1]
			<< ", transitionsLast[1][0]=" << transitionsLast[1][0]
			<< ", transitionsLast[1][1]=" << transitionsLast[1][1] << endl;
	output << "Prob. is " << (float) stateASum / (float) (stateBSum + stateASum)
			<< "\n";

	float alphabeta[2];
	alphabeta[0] = 0;
	alphabeta[1] = 0;
	calAlphaBeta(transitionsLast, alphabeta);

	output << "alpha=" << alphabeta[0] << ", beta=" << alphabeta[1] << "\n";

	// required sample size for the requested precision and confidence
	long TSN = ceil(
			alphabeta[0] * alphabeta[1] * (2 - alphabeta[0] - alphabeta[1])
					/ (pow(alphabeta[0] + alphabeta[1], 3) * pow(r / invPhi, 2)))
			* kstep;
	long maxLength = transitionsLast[0][0] + transitionsLast[0][1]
			+ transitionsLast[1][0] + transitionsLast[1][1];
	int extensionPerChain;
	int round = 0;
	output << "maxLength=" << maxLength << ", N=" << TSN << "\n";
	while (TSN > maxLength) {
		extensionPerChain = (int) ceil((TSN - maxLength) * kstep / (double) N);	//consider kstep here
		output << "extensionPerChain=" << extensionPerChain << endl;
		HANDLE_ERROR(
				cudaMemcpy(gpu_steps, &extensionPerChain, sizeof(int),
						cudaMemcpyHostToDevice));
		if (n < 33) {
			kernel<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 65) {
			kernel2<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 97) {
			kernel3<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 129) {
			kernel4<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 161) {
			kernel5<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 193) {
			kernel6<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else if (n < 513) {
			kernel7<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		} else {
			kernel8<<<block,blockSize,size_sharedMemory>>>(states,gpu_n,gpu_nf,gpu_nv,gpu_cumNf,gpu_cumNv,gpu_F,gpu_varF,gpu_cij,gpu_p,gpu_initialState,gpu_steps,
					gpu_positiveIndex, gpu_negativeIndex,
					gpu_stateA, gpu_stateB, gpu_transitionsLastChain,gpu_bridge,gpu_stateSize);
		}
		HANDLE_ERROR(cudaGetLastError());

		HANDLE_ERROR(
				cudaMemcpy(stateA.data(), gpu_stateA, N * sizeof(long),
						cudaMemcpyDeviceToHost));
		HANDLE_ERROR(
				cudaMemcpy(stateB.data(), gpu_stateB, N * sizeof(long),
						cudaMemcpyDeviceToHost));
		HANDLE_ERROR(
				cudaMemcpy(transitionsLastChain.data(), gpu_transitionsLastChain,
						(size_t) 4 * N * sizeof(int), cudaMemcpyDeviceToHost));
		for (int i = 0; i < N; i++) {
			stateASum += stateA[i];
			stateBSum += stateB[i];
			transitionsLast[0][0] += transitionsLastChain[i * 4];
			transitionsLast[0][1] += transitionsLastChain[i * 4 + 1];
			transitionsLast[1][0] += transitionsLastChain[i * 4 + 2];
			transitionsLast[1][1] += transitionsLastChain[i * 4 + 3];
		}
		output << "stateASum=" << stateASum << " stateBSum=" << stateBSum << endl;
		output << "transitionsLast[0][0]=" << transitionsLast[0][0]
				<< ", transitionsLast[0][1]=" << transitionsLast[0][1]
				<< ", transitionsLast[1][0]=" << transitionsLast[1][0]
				<< ", transitionsLast[1][1]=" << transitionsLast[1][1] << endl;
		output << "Prob. is " << (float) stateASum / (float) (stateBSum + stateASum)
				<< "\n";
		calAlphaBeta(transitionsLast, alphabeta);
		output << "alpha=" << alphabeta[0] << ", beta=" << alphabeta[1] << "\n";
		TSN = ceil(
				alphabeta[0] * alphabeta[1] * (2 - alphabeta[0] - alphabeta[1])
						/ (pow(alphabeta[0] + alphabeta[1], 3) * pow(r / invPhi, 2)))
				* kstep;
		maxLength = transitionsLast[0][0] + transitionsLast[0][1]
				+ transitionsLast[1][0] + transitionsLast[1][1];
		round++;
		output << "maxLength=" << maxLength << ", N=" << TSN << "\n";
	}

	output << "Finished in " << round << " round. Prob. is "
			<< (float) stateASum / (float) (stateBSum + stateASum) << "\n";
	duration = (std::clock() - cpu_start) / (double) CLOCKS_PER_SEC;

	output << "time duration : " << duration << "s\n";

	// stop CUDA timer
	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	HANDLE_ERROR(cudaEventElapsedTime(&memsettime, start, stop));
	output << " *** CUDA execution time: " << memsettime << " ms*** \n";
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	output.close();

	// every device buffer is released exactly once
	cudaFree(gpu_n);
	cudaFree(gpu_nf);
	cudaFree(gpu_nv);
	cudaFree(gpu_cumNf);
	cudaFree(gpu_cumNv);
	cudaFree(gpu_F);
	cudaFree(gpu_varF);
	cudaFree(gpu_cij);
	cudaFree(gpu_p);
	cudaFree(gpu_steps);
	cudaFree(gpu_positiveIndex);
	cudaFree(gpu_negativeIndex);
	cudaFree(gpu_stateA);
	cudaFree(gpu_stateB);
	cudaFree(gpu_transitionsLastChain);
	cudaFree(gpu_bridge);
	cudaFree(gpu_initialState);
	cudaFree(gpu_stateSize);
	cudaFree(gpu_trajectoryKernel);
	cudaFree(states);

	//write result array
	jdouble result[4];
	result[0] = (float) stateASum / (float) (stateBSum + stateASum);
	result[1] = stateBSum + stateASum + (long) steps * N;	//counting burn-in as well
	result[2] = duration;
	result[3] = memsettime / 1000.0;
	jdoubleArray newArray = env->NewDoubleArray(4);
	if (newArray == NULL)
		return NULL;	// the JVM has already raised OutOfMemoryError
	env->SetDoubleArrayRegion(newArray, 0, 4, result);
	return newArray;
}
  
